# Load packages
import pandas as pd
import numpy as np
import pandas_profiling
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
# Load training data to pandas
df = pd.read_csv('training_data.csv')
df.head()
| formulaA | formulaB | formulaA_elements_AtomicVolume | formulaB_elements_AtomicVolume | formulaA_elements_AtomicWeight | formulaB_elements_AtomicWeight | formulaA_elements_BoilingT | formulaB_elements_BoilingT | formulaA_elements_BulkModulus | formulaB_elements_BulkModulus | ... | formulaB_elements_Row | formulaA_elements_ShearModulus | formulaB_elements_ShearModulus | formulaA_elements_SpaceGroupNumber | formulaB_elements_SpaceGroupNumber | avg_coordination_A | avg_coordination_B | avg_nearest_neighbor_distance_A | avg_nearest_neighbor_distance_B | stabilityVec | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Ac | Ag | 37.433086 | 17.075648 | 227.0 | 107.868200 | 3473.0 | 2435.0 | 0.0 | 100.0 | ... | 5 | 0.0 | 30.0 | 225 | 225 | 12.0 | 12.0 | 3.99462 | 2.94195 | [1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0] |
| 1 | Ac | Al | 37.433086 | 16.594425 | 227.0 | 26.981539 | 3473.0 | 2792.0 | 0.0 | 76.0 | ... | 3 | 0.0 | 26.0 | 225 | 225 | 12.0 | 12.0 | 3.99462 | 2.85595 | [1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0] |
| 2 | Ac | As | 37.433086 | 21.723966 | 227.0 | 74.921600 | 3473.0 | 887.0 | 0.0 | 22.0 | ... | 4 | 0.0 | 0.0 | 225 | 166 | 12.0 | 3.0 | 3.99462 | 2.55790 | [1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0] |
| 3 | Ac | Ba | 37.433086 | 64.969282 | 227.0 | 137.327000 | 3473.0 | 2143.0 | 0.0 | 9.6 | ... | 6 | 0.0 | 4.9 | 225 | 229 | 12.0 | 8.0 | 3.99462 | 4.35637 | [1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0] |
| 4 | Ac | Bi | 37.433086 | 35.483459 | 227.0 | 208.980400 | 3473.0 | 1837.0 | 0.0 | 31.0 | ... | 6 | 0.0 | 12.0 | 225 | 12 | 12.0 | 3.0 | 3.99462 | 3.11221 | [1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0] |
5 rows × 99 columns
# Check if there is any missing data using df.info() and df.describe()
df.describe()
#df.info()
| formulaA_elements_AtomicVolume | formulaB_elements_AtomicVolume | formulaA_elements_AtomicWeight | formulaB_elements_AtomicWeight | formulaA_elements_BoilingT | formulaB_elements_BoilingT | formulaA_elements_BulkModulus | formulaB_elements_BulkModulus | formulaA_elements_Column | formulaB_elements_Column | ... | formulaA_elements_Row | formulaB_elements_Row | formulaA_elements_ShearModulus | formulaB_elements_ShearModulus | formulaA_elements_SpaceGroupNumber | formulaB_elements_SpaceGroupNumber | avg_coordination_A | avg_coordination_B | avg_nearest_neighbor_distance_A | avg_nearest_neighbor_distance_B | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 2572.000000 | 2572.000000 | 2572.000000 | 2572.000000 | 2572.000000 | 2572.000000 | 2572.000000 | 2572.000000 | 2572.000000 | 2572.000000 | ... | 2572.000000 | 2572.000000 | 2572.000000 | 2572.000000 | 2572.000000 | 2572.000000 | 2572.000000 | 2572.000000 | 2572.000000 | 2572.000000 |
| mean | 2207.340923 | 2220.778005 | 112.319674 | 113.247322 | 2733.916283 | 2740.693188 | 74.569868 | 78.194751 | 7.992224 | 8.064930 | ... | 4.844090 | 4.857698 | 34.256726 | 35.582387 | 187.127138 | 189.957232 | 9.152102 | 9.271954 | 3.114011 | 3.130684 |
| std | 8729.184304 | 8751.899407 | 65.258759 | 65.877000 | 1507.624155 | 1510.148266 | 93.757854 | 96.094178 | 5.496219 | 5.475384 | ... | 1.377499 | 1.373744 | 50.611912 | 51.760457 | 56.243399 | 53.868652 | 3.637761 | 3.597788 | 0.708516 | 0.716404 |
| min | 7.297767 | 7.297767 | 4.002602 | 4.002602 | 4.070000 | 4.070000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | ... | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 2.000000 | 2.000000 | 1.000000 | 1.000000 | 1.424090 | 1.424090 |
| 25% | 15.858734 | 15.858734 | 55.845000 | 55.845000 | 1469.000000 | 1615.000000 | 6.300000 | 7.700000 | 3.000000 | 3.000000 | ... | 4.000000 | 4.000000 | 0.000000 | 0.000000 | 194.000000 | 194.000000 | 6.000000 | 8.000000 | 2.626730 | 2.665670 |
| 50% | 26.082658 | 26.966785 | 107.868200 | 107.868200 | 2973.000000 | 2973.000000 | 38.700000 | 41.000000 | 7.000000 | 7.000000 | ... | 5.000000 | 5.000000 | 18.000000 | 18.000000 | 194.000000 | 194.000000 | 12.000000 | 12.000000 | 2.949550 | 2.949550 |
| 75% | 34.784501 | 34.784501 | 164.930320 | 167.259000 | 3680.000000 | 3676.250000 | 110.000000 | 120.000000 | 13.000000 | 13.000000 | ... | 6.000000 | 6.000000 | 38.000000 | 38.000000 | 225.000000 | 225.000000 | 12.000000 | 12.000000 | 3.541010 | 3.560590 |
| max | 37236.035560 | 37236.035560 | 238.028910 | 238.028910 | 5869.000000 | 5869.000000 | 380.000000 | 380.000000 | 18.000000 | 18.000000 | ... | 7.000000 | 7.000000 | 222.000000 | 222.000000 | 229.000000 | 229.000000 | 12.000000 | 12.000000 | 5.323950 | 5.323950 |
8 rows × 96 columns
df.isnull().sum()
formulaA 0
formulaB 0
formulaA_elements_AtomicVolume 0
formulaB_elements_AtomicVolume 0
formulaA_elements_AtomicWeight 0
formulaB_elements_AtomicWeight 0
formulaA_elements_BoilingT 0
formulaB_elements_BoilingT 0
formulaA_elements_BulkModulus 0
formulaB_elements_BulkModulus 0
formulaA_elements_Column 0
formulaB_elements_Column 0
formulaA_elements_CovalentRadius 0
formulaB_elements_CovalentRadius 0
formulaA_elements_Density 0
formulaB_elements_Density 0
formulaA_elements_ElectronSurfaceDensityWS 0
formulaB_elements_ElectronSurfaceDensityWS 0
formulaA_elements_Electronegativity 0
formulaB_elements_Electronegativity 0
formulaA_elements_FirstIonizationEnergy 0
formulaB_elements_FirstIonizationEnergy 0
formulaA_elements_GSbandgap 0
formulaB_elements_GSbandgap 0
formulaA_elements_GSenergy_pa 0
formulaB_elements_GSenergy_pa 0
formulaA_elements_GSestBCClatcnt 0
formulaB_elements_GSestBCClatcnt 0
formulaA_elements_GSestFCClatcnt 0
formulaB_elements_GSestFCClatcnt 0
..
formulaB_elements_NdUnfilled 0
formulaA_elements_NdValence 0
formulaB_elements_NdValence 0
formulaA_elements_NfUnfilled 0
formulaB_elements_NfUnfilled 0
formulaA_elements_NfValence 0
formulaB_elements_NfValence 0
formulaA_elements_NpUnfilled 0
formulaB_elements_NpUnfilled 0
formulaA_elements_NpValence 0
formulaB_elements_NpValence 0
formulaA_elements_NsUnfilled 0
formulaB_elements_NsUnfilled 0
formulaA_elements_NsValence 0
formulaB_elements_NsValence 0
formulaA_elements_Number 0
formulaB_elements_Number 0
formulaA_elements_Polarizability 0
formulaB_elements_Polarizability 0
formulaA_elements_Row 0
formulaB_elements_Row 0
formulaA_elements_ShearModulus 0
formulaB_elements_ShearModulus 0
formulaA_elements_SpaceGroupNumber 0
formulaB_elements_SpaceGroupNumber 0
avg_coordination_A 0
avg_coordination_B 0
avg_nearest_neighbor_distance_A 0
avg_nearest_neighbor_distance_B 0
stabilityVec 0
Length: 99, dtype: int64
# Find if there is any columns are highly correlated.
# pandas_profiling.ProfileReport(df)
# Drop 13 features which are highly correlated with another feature in dataset. Drop formulaA and formulaB.
# DataFrame.drop returns a new frame, so `df` itself is left untouched.
# The list holds the redundant member of each highly correlated feature pair
# (7 for element A, 6 for element B) plus the two element-symbol columns.
df_new = df.drop(
    columns=[
        'formulaA_elements_GSbandgap',
        'formulaA_elements_GSestBCClatcnt',
        'formulaA_elements_GSvolume_pa',
        'formulaA_elements_ICSDVolume',
        'formulaA_elements_Column',
        'formulaA_elements_Number',
        'formulaA_elements_Row',
        'formulaB_elements_GSestBCClatcnt',
        'formulaB_elements_GSvolume_pa',
        'formulaB_elements_ICSDVolume',
        'formulaB_elements_Column',
        'formulaB_elements_Number',
        'formulaB_elements_Row',
        'formulaA',
        'formulaB',
    ]
)
# Step 1. Transform the stability vector [1,0,....,1] into 11 separated columns/classes.
# Step 2. For each class, the composition ratio of A:B can be considered as 100, 90, 80, ..., 0.
# Extract the ratio of A:B to a separate column and add it to feature dataset (X)
# Step 3. Aggregate data to binary classification: generate 'label' with 1 or 0 to each sample.
# Step 4. Output three data sets: df contains input X and 11 classes; y_data contains 11 separate classes;
# dfc contains input X plus one A:B composition and 1 binary classification label.
def binary_composition(df):
    """Expand the ``stabilityVec`` string column into class labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns plus a string column ``stabilityVec`` formatted like
        ``"[1.0,0.0,...,1.0]"``. The frame passed in is not modified.

    Returns
    -------
    df : pandas.DataFrame
        The features followed by one float column ``class_<i>`` per entry of
        the stability vector (``stabilityVec`` itself is removed).
    y_data : pandas.DataFrame
        Only the ``class_<i>`` columns, cast to int.
    dfc : pandas.DataFrame
        Long-format table for binary classification: the features repeated
        once per class, with ``label`` (int, the value of that class) and
        ``B_ratio`` (float, ``10 * i`` for ``class_<i>``). Row blocks run from
        the last class down to the first; rows containing NaN are dropped.

    Raises
    ------
    ValueError
        If ``df`` has no rows, so the vector length cannot be determined.
    """
    if len(df) == 0:
        raise ValueError('binary_composition needs at least one row')

    # Work on a copy so the caller's frame does not gain helper columns.
    df = df.copy()

    # "[1.0,0.0,...]" -> ['1.0', '0.0', ...]: strip the brackets, split on commas.
    parsed = df['stabilityVec'].apply(lambda vec: vec[1:-1].split(','))
    # Positional lookup: the index is not guaranteed to contain the label 0.
    n_classes = len(parsed.iloc[0])

    for i in range(n_classes):
        # Builtin float replaces np.float, which NumPy removed in 1.24.
        df['class_' + str(i)] = parsed.apply(lambda vec, i=i: vec[i]).astype(float)
    df = df.drop(columns=['stabilityVec'])

    # The class columns were appended last, so they are the trailing columns.
    x_data = df.iloc[:, :-n_classes]
    y_data = df.iloc[:, -n_classes:].astype(int)  # n samples x n_classes

    # One block of rows per class. Iterating in reverse reproduces the row
    # order of the previous implementation, which prepended each new block.
    blocks = []
    for j, col in reversed(list(enumerate(y_data.columns))):
        block = x_data.copy()
        block['label'] = y_data[col]
        # Kept as float: this column has always come out as float64.
        block['B_ratio'] = float(j * 10)
        blocks.append(block)

    dfc = pd.concat(blocks, ignore_index=True)
    dfc.dropna(inplace=True)
    dfc['label'] = dfc['label'].astype(int)

    # df: features + separated classes; y_data: class columns only;
    # dfc: binary-label table.
    return df, y_data, dfc
# df_ml will be used in the 2nd approach for multi-label classification.
# y_class has separted 11 columns as 11 classes.
# df_bc will be used in the 1st approach which is a pure binary classification case.
df_ml, y_class, df_bc = binary_composition(df_new)
print(df_ml.shape) # for multi-label classification
print(y_class.shape)
print(df_bc.shape) # for binary classification
(2572, 94) (2572, 11) (28292, 85)
# A glance of the processed data.
df_bc.head()
| B_ratio | avg_coordination_A | avg_coordination_B | avg_nearest_neighbor_distance_A | avg_nearest_neighbor_distance_B | formulaA_elements_AtomicVolume | formulaA_elements_AtomicWeight | formulaA_elements_BoilingT | formulaA_elements_BulkModulus | formulaA_elements_CovalentRadius | ... | formulaB_elements_NfUnfilled | formulaB_elements_NfValence | formulaB_elements_NpUnfilled | formulaB_elements_NpValence | formulaB_elements_NsUnfilled | formulaB_elements_NsValence | formulaB_elements_Polarizability | formulaB_elements_ShearModulus | formulaB_elements_SpaceGroupNumber | label | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 100.0 | 12.0 | 12.0 | 3.99462 | 2.94195 | 37.433086 | 227.0 | 3473.0 | 0.0 | 215 | ... | 0 | 0 | 0 | 0 | 1 | 1 | 6.99 | 30.0 | 225 | 1 |
| 1 | 100.0 | 12.0 | 12.0 | 3.99462 | 2.85595 | 37.433086 | 227.0 | 3473.0 | 0.0 | 215 | ... | 0 | 0 | 5 | 1 | 0 | 2 | 6.80 | 26.0 | 225 | 1 |
| 2 | 100.0 | 12.0 | 3.0 | 3.99462 | 2.55790 | 37.433086 | 227.0 | 3473.0 | 0.0 | 215 | ... | 0 | 0 | 3 | 3 | 0 | 2 | 4.31 | 0.0 | 166 | 1 |
| 3 | 100.0 | 12.0 | 8.0 | 3.99462 | 4.35637 | 37.433086 | 227.0 | 3473.0 | 0.0 | 215 | ... | 0 | 0 | 0 | 0 | 0 | 2 | 39.70 | 4.9 | 229 | 1 |
| 4 | 100.0 | 12.0 | 3.0 | 3.99462 | 3.11221 | 37.433086 | 227.0 | 3473.0 | 0.0 | 215 | ... | 0 | 14 | 3 | 3 | 0 | 2 | 0.40 | 12.0 | 12 | 1 |
5 rows × 85 columns
# All 11 classes histogram
y_class.hist(figsize = (12, 12));
# Share of samples whose binary label is 1, as a percentage.
positive_ratio = len(df_bc[df_bc['label'] == 1])/len(df_bc) * 100
# The number of decimals belongs to np.round; it used to be passed to
# str.format, where it was silently ignored and the value was rounded to 0 places.
print('{}% data is positive.'.format(np.round(positive_ratio, 3)))
27.0% data is positive.
# 9 classes correlation coefficients showed all classes are not correlated
colormap = plt.cm.plasma
plt.figure(figsize=(8,8))
plt.title('Correlation of Class Labels',y=1,size=14)
sns.heatmap(y_class.iloc[:,1:10].astype(float).corr(),linewidths=0.1,vmax=1.0,square=True,cmap=colormap,
linecolor='white',annot=True);
# pandas_profiling.ProfileReport(df_bc)
The dataset has 84 features. Thus, I use PCA to transform this high dimensional data to lower dimensional space to visualize the variation and check if I can select a few features for model prediction.
As indicated from the following figure, the first principal component only accounts for 10 percent of the variance. Also, we can see that the first ten principal components combined explain around 60 percent of the variance in the data.
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
def pca(X_data, n_components=20):
    """Plot the explained-variance profile of a PCA fitted on ``X_data``.

    Parameters
    ----------
    X_data : array-like of shape (n_samples, n_features)
        Raw feature matrix; it is standardized before the PCA is fitted.
    n_components : optional
        Forwarded to ``PCA(n_components=...)``. Defaults to 20, the value
        that used to be hard-coded.

    Returns
    -------
    None
        A bar chart (per-component ratio) overlaid with a step line
        (cumulative ratio) is shown.
    """
    # Normalize the feature scales before applying PCA.
    X_data_std = StandardScaler().fit_transform(X_data)

    # Only the fitted attributes are needed, so fit() replaces fit_transform().
    model = PCA(n_components=n_components)
    model.fit(X_data_std)
    ratio = model.explained_variance_ratio_

    # Derive the x positions from the fitted result instead of repeating the
    # component count as a literal.
    component_ids = range(1, len(ratio) + 1)

    plt.figure(figsize=(6, 4))
    plt.bar(component_ids, ratio, alpha=0.5, align='center')
    plt.step(component_ids, np.cumsum(ratio), color='orange', where='mid')
    plt.ylabel('Explained variance ratio')
    plt.xlabel('Principal components')
    plt.show()
X = df_bc.drop(['label'], axis = 1)
pca(X)
# Prepare training dataset with features (X) and labels (Y) for binary classification
X = df_bc.drop(['label'], axis = 1)
y = df_bc['label']
print('No. of features: ',len(X.columns))
print('X dimension:', X.shape)
print('y dimension:', y.shape)
No. of features: 84 X dimension: (28292, 84) y dimension: (28292,)
Since the data is imbalanced, I pass `class_weight='balanced'` to the training algorithms.
Logistic regression did not perform well because it does not handle high-dimensional data very well and, in addition, the data is not linearly separable.
For SVM, there are two kernels to consider: 'linear' vs. 'rbf'. Since we have high-dimensional features and PCA already showed high variance across all features, a linear kernel will probably not work well. 'rbf' is a Gaussian kernel, which maps the original features to a different feature space using a Gaussian function. There are two parameters to tune here: 'C' is the regularization parameter and 'gamma' is the kernel parameter, which controls how wide the Gaussian kernel is. Usually a large C means less regularization and tends to overfit the training data; a large gamma means a narrower Gaussian kernel and also tends to overfit the training data. Since SVM showed promising performance compared to RF and GBDT, I will fine-tune the SVM hyperparameters to get the best model.
Random forest outperformed SVM. We can get feature importance from this model and usually there is no need to fine tune the hyperparameters. It is less prone to overfit the data since it is an ensemble method.
Gradient boosting decision trees performed the best, with a high F1 score. The algorithm is built upon a series of small decision trees, and each tree attempts to correct the errors from the earlier stage. The hyperparameters include the number of trees, the learning rate and the max depth. The learning rate controls how hard each new tree tries to correct the remaining mistakes from the previous round. Usually a high learning rate means more complex trees and a low learning rate means simpler trees. The model requires a lot of computation compared to the other models, and it will not perform well on high-dimensional sparse features. This could be the reason why it did not perform as well as random forest in the 2nd approach, where I have to train each class as a separate classifier.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.metrics import auc, roc_curve, classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.base import clone
from sklearn.model_selection import StratifiedKFold
# Construct the binary classification class to test on different models:
# Step 1: initialize global parameters
# Step 2: partition the train and test set with a 3:1 ratio; all samples should be shuffled before the split
# Step 3: standardize the features (both train and test) using StandardScaler which standardizes each features
# to gaussian distribution with mean at 0 and unit variance.
# Step 4: Fit the training data and use F1 score as metric.
class BinaryClassification():
    """Evaluate one binary classifier on a shuffled, stratified hold-out split.

    The estimator handed in is cloned; this class only ever fits the clone.
    ``fit`` stores rounded precision/recall/F1 in ``scores_``, and ``roc_cm``
    plots the ROC curve and the confusion matrix for the same split.
    """

    def __init__(self, estimator, scoring = [precision_score, recall_score,f1_score],
                 test_size = 0.25, random_state = 1):
        # Metric callables, each invoked as metric(y_true, y_pred).
        self.scoring = scoring
        self.estimator = clone(estimator)
        self.test_size = test_size
        self.random_state = random_state

    def standardization(self, X_train, X_test):
        """Scale both sets with statistics learned from the training set only."""
        scaler = StandardScaler().fit(X_train)
        return scaler.transform(X_train), scaler.transform(X_test)

    def _holdout(self, X, y):
        # Shuffled, stratified split followed by train-fitted scaling.
        X_tr, X_te, y_tr, y_te = train_test_split(
            X, y,
            test_size=self.test_size,
            random_state=self.random_state,
            shuffle=True,
            stratify=y,
        )
        X_tr, X_te = self.standardization(X_tr, X_te)
        return X_tr, X_te, y_tr, y_te

    def fit(self, X, y):
        """Fit on the training part, score on the held-out part; returns self."""
        X_tr, X_te, y_tr, y_te = self._holdout(X, y)
        results = self._calc_score(X_tr, y_tr, X_te, y_te)
        # The first three metrics are reported under fixed names.
        self.scores_ = {
            label: np.round(results[pos], 3)
            for pos, label in enumerate(('Precision', 'Recall', 'F1'))
        }
        return self

    def _calc_score(self, X_train_scaled, y_train, X_test_scaled, y_test):
        """Fit the estimator and return one value per metric in ``scoring``."""
        self.estimator.fit(X_train_scaled, y_train)
        predicted = self.estimator.predict(X_test_scaled)
        return [metric(y_test, predicted) for metric in self.scoring]

    def roc_cm(self, X, y):
        """Refit on the hold-out split, then draw the ROC curve and confusion matrix.

        The estimator must provide ``predict_proba``.
        """
        X_tr, X_te, y_tr, y_te = self._holdout(X, y)
        self.estimator.fit(X_tr, y_tr)
        probabilities = self.estimator.predict_proba(X_te)
        predicted = self.estimator.predict(X_te)

        # ROC from the probability of the positive class (column 1).
        fpr, tpr, thresholds = roc_curve(y_te, probabilities[:, 1], pos_label=1)
        roc_auc = auc(fpr, tpr)

        plt.figure(figsize=(6, 4))
        plt.title('Receiver Operating Characteristic')
        plt.plot(fpr, tpr, 'b', label='AUC = %0.3f' % roc_auc)
        plt.legend(loc='lower right')
        plt.plot([0, 1], [0, 1], 'r--')  # diagonal reference line
        plt.xlim([-0.1, 1.0])
        plt.ylim([-0.1, 1.01])
        plt.ylabel('True Positive Rate')
        plt.xlabel('False Positive Rate')
        plt.show()

        # Confusion matrix with the count written into every cell.
        confusionMatrix = confusion_matrix(y_true=y_te, y_pred=predicted)
        fig, ax = plt.subplots(figsize=(3, 3))
        ax.matshow(confusionMatrix, cmap=plt.cm.Blues, alpha=0.3)
        for (row, col), count in np.ndenumerate(confusionMatrix):
            ax.text(x=col, y=row, s=count, va='center', ha='center')
        plt.title('Confusion Matrix', y=1.2)
        plt.xlabel('Predicted label')
        plt.ylabel('True label')
        plt.show()
# Logistic Regression: Add regularization term L1 with C 0.1 here. Other parameters were tested but no improvement
# An L1 penalty needs a solver that supports it. 'liblinear' was the implicit
# default when this was written; newer scikit-learn defaults to 'lbfgs', which
# rejects penalty='l1', so the solver is now named explicitly.
lr = LogisticRegression(penalty = 'l1', solver = 'liblinear', C = 0.1, class_weight='balanced')
lr_clf = BinaryClassification(lr)
lr_clf.fit(X, y)
print('Logistic Regression:', lr_clf.scores_)
Logistic Regression: {'Precision': 0.329, 'Recall': 0.592, 'F1': 0.423}
# SVM with rbf kernel: Improved performance based on LR and a little worse than RF.
# Next step: fine tune hyperparameters and find out the best SVM model.
svm = SVC(kernel = 'rbf', C = 1, gamma=0.01,class_weight='balanced')
svm_clf = BinaryClassification(svm)
svm_clf.fit(X, y)
print('SVM:', svm_clf.scores_)
SVM: {'Precision': 0.423, 'Recall': 0.61, 'F1': 0.5}
# Random forest showed good F1 score
forest = RandomForestClassifier(n_estimators=500, class_weight='balanced')
forest_clf = BinaryClassification(forest)
forest_clf.fit(X, y)
print('Random Forest:',forest_clf.scores_)
Random Forest: {'Precision': 0.723, 'Recall': 0.692, 'F1': 0.707}
# GBDT showed best F1 score so far
# Next step: fine tune hyperparameters
GBDT = GradientBoostingClassifier(n_estimators=500)
GBDT_clf = BinaryClassification(GBDT)
GBDT_clf.fit(X, y)
print('Gradient Boosting Decision Trees:',GBDT_clf.scores_)
Gradient Boosting Decision Trees: {'Precision': 0.938, 'Recall': 0.783, 'F1': 0.853}
from sklearn.model_selection import StratifiedKFold
# Goal: Find best hyperparameter for SVM
# Step1: standardize both training and test data
# Step2: for different combinations of C, gamma and kernel, print all precision and recall values
# Step3: find the best parameter for SVM.
class modelSelection_SVM():
    """Hyperparameter exploration helpers for an RBF-kernel SVC.

    ``gridSearch`` prints hold-out scores for every (C, gamma) pair, ``kFold``
    prints stratified k-fold scores for one pair, and ``ROC`` plots the ROC
    curve for one pair. All three use ``class_weight='balanced'`` and honour
    the ``test_size`` / ``random_state`` given to the constructor.
    """

    def __init__(self, scoring = [precision_score, recall_score,f1_score], test_size = 0.25, random_state =1):
        # Metric callables, each invoked as metric(y_true, y_pred). kFold
        # prints the first three as precision, recall and F1.
        self.scoring = scoring
        self.test_size = test_size
        self.random_state = random_state

    def standardization(self, X_train, X_test):
        """Scale both sets with statistics learned from the training set only."""
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        return X_train_scaled, X_test_scaled

    @staticmethod
    def _make_svc(c, gamma):
        # Build the balanced RBF SVC; a None value leaves that parameter at
        # the library default instead of passing None through to SVC.
        params = {'class_weight': 'balanced', 'kernel': 'rbf'}
        if c is not None:
            params['C'] = c
        if gamma is not None:
            params['gamma'] = gamma
        return SVC(**params)

    def _split_scaled(self, X, y):
        # Shuffled, stratified hold-out split using the instance settings
        # (these used to be hard-coded to 0.25 / 1), then train-fitted scaling.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y,
            test_size=self.test_size,
            random_state=self.random_state,
            shuffle=True,
            stratify=y,
        )
        X_train_scaled, X_test_scaled = self.standardization(X_train, X_test)
        return X_train_scaled, X_test_scaled, y_train, y_test

    def gridSearch(self, X, y, C = [0.001, 0.01, 0.1, 1, 10, 100], gamma = [0.001, 0.01, 0.1, 1, 10, 100]):
        """Print precision, recall and F1 on the hold-out set for every (C, gamma) pair."""
        X_train_scaled, X_test_scaled, y_train, y_test = self._split_scaled(X, y)
        # Grid search over all C and gamma combinations (6 x 6 = 36 fits by default).
        for c in C:
            for g in gamma:
                clf = self._make_svc(c, g)
                clf.fit(X_train_scaled, y_train)
                y_pred = clf.predict(X_test_scaled)
                precision = precision_score(y_test, y_pred)
                recall = recall_score(y_test, y_pred)
                f1 = f1_score(y_test, y_pred)
                print('C: %.3f, gamma: %.3f, precision: %.3f, recall: %.3f, F1: %.3f' %
                      (c, g, precision, recall, f1))

    def kFold(self, X, y, cv, c = None, gamma = None):
        """Print per-fold and mean +/- std scores from stratified ``cv``-fold CV.

        ``X`` and ``y`` are indexed with ``.iloc``, so they must be pandas objects.
        """
        # Let the splitter do the shuffling with a fixed seed. The previous
        # unseeded manual permutation made folds irreproducible, and passing
        # random_state without shuffle=True is rejected by current scikit-learn.
        splitter = StratifiedKFold(n_splits=cv, shuffle=True, random_state=self.random_state)

        # fold number -> list of metric values, in the order of self.scoring
        scores = {}
        for k, (train, test) in enumerate(splitter.split(X, y), start=1):
            estimator = self._make_svc(c, gamma)
            # Scale inside the fold so test statistics never leak into training.
            X_train, X_test = self.standardization(X.iloc[train], X.iloc[test])
            estimator.fit(X_train, y.iloc[train])
            y_pred = estimator.predict(X_test)
            scores[k] = [score(y.iloc[test], y_pred) for score in self.scoring]
            print('Fold: %2d, Class dist.: %s, precision: %.3f, recall: %.3f, F1: %.3f' % (k,
                  np.bincount(y.iloc[train]), scores[k][0], scores[k][1], scores[k][2]))

        # Aggregate each of the three reported metrics across folds.
        for pos, label in enumerate(('\nCV precision', 'CV recall', 'CV F1')):
            values = [fold_scores[pos] for fold_scores in scores.values()]
            print('%s: %.3f +/- %.3f' % (label, np.mean(values), np.std(values)))

    def ROC(self, X, y, c=None, gamma=None):
        """Plot the hold-out ROC curve (with AUC) for one (c, gamma) pair."""
        best_estimator = self._make_svc(c, gamma)
        X_train_scaled, X_test_scaled, y_train, y_test = self._split_scaled(X, y)
        best_estimator.fit(X_train_scaled, y_train)
        # The signed decision_function value serves as the ranking score.
        y_pred_score = best_estimator.decision_function(X_test_scaled)
        fpr, tpr, thresholds = roc_curve(y_test, y_pred_score, pos_label=1)
        roc_auc = auc(fpr, tpr)
        # Plot ROC
        plt.figure(figsize=(6, 4))
        plt.title('Receiver Operating Characteristic--SVM')
        plt.plot(fpr, tpr, 'b', label='AUC = %0.3f' % roc_auc)
        plt.legend(loc='lower right')
        plt.plot([0, 1], [0, 1], 'r--')  # diagonal reference line
        plt.xlim([0, 1.0])
        plt.ylim([-0.1, 1.01])
        plt.ylabel('True Positive Rate')
        plt.xlabel('False Positive Rate')
        plt.show()
model = modelSelection_SVM()
model.gridSearch(X, y)
C: 0.001, gamma: 0.001, precision: 0.270, recall: 1.000, F1: 0.425 C: 0.001, gamma: 0.010, precision: 0.270, recall: 1.000, F1: 0.425 C: 0.001, gamma: 0.100, precision: 0.270, recall: 1.000, F1: 0.425 C: 0.001, gamma: 1.000, precision: 0.270, recall: 1.000, F1: 0.425 C: 0.001, gamma: 10.000, precision: 0.270, recall: 1.000, F1: 0.425 C: 0.001, gamma: 100.000, precision: 0.270, recall: 1.000, F1: 0.425 C: 0.010, gamma: 0.001, precision: 0.270, recall: 1.000, F1: 0.425 C: 0.010, gamma: 0.010, precision: 0.325, recall: 0.683, F1: 0.440 C: 0.010, gamma: 0.100, precision: 0.270, recall: 1.000, F1: 0.425 C: 0.010, gamma: 1.000, precision: 0.270, recall: 1.000, F1: 0.425 C: 0.010, gamma: 10.000, precision: 0.270, recall: 1.000, F1: 0.425 C: 0.010, gamma: 100.000, precision: 0.270, recall: 1.000, F1: 0.425 C: 0.100, gamma: 0.001, precision: 0.320, recall: 0.651, F1: 0.429 C: 0.100, gamma: 0.010, precision: 0.344, recall: 0.644, F1: 0.448 C: 0.100, gamma: 0.100, precision: 0.324, recall: 0.559, F1: 0.410 C: 0.100, gamma: 1.000, precision: 0.279, recall: 0.580, F1: 0.377 C: 0.100, gamma: 10.000, precision: 0.270, recall: 1.000, F1: 0.425 C: 0.100, gamma: 100.000, precision: 0.270, recall: 1.000, F1: 0.425 C: 1.000, gamma: 0.001, precision: 0.332, recall: 0.650, F1: 0.439 C: 1.000, gamma: 0.010, precision: 0.423, recall: 0.610, F1: 0.500 C: 1.000, gamma: 0.100, precision: 0.309, recall: 0.438, F1: 0.362 C: 1.000, gamma: 1.000, precision: 0.223, recall: 0.313, F1: 0.260 C: 1.000, gamma: 10.000, precision: 0.186, recall: 0.090, F1: 0.121 C: 1.000, gamma: 100.000, precision: 0.000, recall: 0.000, F1: 0.000 C: 10.000, gamma: 0.001, precision: 0.365, recall: 0.627, F1: 0.461 C: 10.000, gamma: 0.010, precision: 0.529, recall: 0.730, F1: 0.613 C: 10.000, gamma: 0.100, precision: 0.327, recall: 0.419, F1: 0.367 C: 10.000, gamma: 1.000, precision: 0.249, recall: 0.198, F1: 0.220 C: 10.000, gamma: 10.000, precision: 0.176, recall: 0.074, F1: 0.105 C: 10.000, gamma: 100.000, precision: 
0.000, recall: 0.000, F1: 0.000 C: 100.000, gamma: 0.001, precision: 0.533, recall: 0.736, F1: 0.618 C: 100.000, gamma: 0.010, precision: 0.578, recall: 0.725, F1: 0.643 C: 100.000, gamma: 0.100, precision: 0.410, recall: 0.389, F1: 0.399 C: 100.000, gamma: 1.000, precision: 0.211, recall: 0.173, F1: 0.190 C: 100.000, gamma: 10.000, precision: 0.176, recall: 0.074, F1: 0.105 C: 100.000, gamma: 100.000, precision: 0.000, recall: 0.000, F1: 0.000
# K-fold cross-validation showed the average F1 score is 0.637 for SVM. So far, the
# best hyperparameters found for SVM are: C=100, gamma=0.01, with F1: 0.637 +/- 0.009
model = modelSelection_SVM()
model.kFold(X, y, cv = 5, c = 100.0, gamma = 0.01)
Fold: 1, Class dist.: [16524 6109], precision: 0.573, recall: 0.724, F1: 0.640 Fold: 2, Class dist.: [16524 6109], precision: 0.561, recall: 0.745, F1: 0.640 Fold: 3, Class dist.: [16524 6110], precision: 0.572, recall: 0.739, F1: 0.645 Fold: 4, Class dist.: [16524 6110], precision: 0.570, recall: 0.724, F1: 0.638 Fold: 5, Class dist.: [16524 6110], precision: 0.543, recall: 0.722, F1: 0.620 CV precision: 0.564 +/- 0.011 CV recall: 0.731 +/- 0.009 CV F1: 0.637 +/- 0.009
# Plot ROC curve and get AUC 0.823, not that bad. Ideally we want the blue curve to be as further away from the
# red curve as possible and best AUC score would be 1.0 in this case. Red curve represent random guess (50% chance)
model = modelSelection_SVM()
model.ROC(X, y, c = 100, gamma = 0.01 )
# Plot Random Forest ROC and Confusion Matrix
# AUC is improved and the ROC curve looks better than SVM
forest = RandomForestClassifier(n_estimators=1000, class_weight='balanced')
forest_clf = BinaryClassification(forest)
forest_clf.roc_cm(X, y)
# Build a grid search function to search for best hyperparameter for GBDT
def GBDT(n_trees , depth , rate , X, y):
    """Grid-search a GradientBoostingClassifier and print the best F1 setting.

    Parameters
    ----------
    n_trees, depth, rate : iterables
        Candidate values for ``n_estimators``, ``max_depth`` and
        ``learning_rate``; every combination is fitted once.
    X, y : features and binary labels, split 75/25 (shuffled, stratified,
        ``random_state=1``) into train and test parts.

    Raises
    ------
    ValueError
        If any of the three candidate collections is empty.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=1, shuffle=True, stratify=y)

    # (n_estimators, learning_rate, max_depth) -> F1 on the held-out part
    scores = {}
    # Grid search over all combinations of no. of trees, learning rate and max depth.
    for n in n_trees:
        for r in rate:
            for d in depth:
                clf = GradientBoostingClassifier(n_estimators=n, max_depth=d, learning_rate=r)
                clf.fit(X_train, y_train)
                y_pred = clf.predict(X_test)
                scores[(n, r, d)] = f1_score(y_test, y_pred)

    if not scores:
        raise ValueError('n_trees, depth and rate must each contain at least one value')

    # Take the parameters from the winning key. The old code read the loop
    # variables n, r, d, which always hold the last grid point, so it reported
    # the last combination regardless of which one scored best.
    best_para = max(scores, key=scores.get)
    fmax = scores[best_para]
    print('Gradient Boosting Classifier best hyperparameters: n_estimators: %d, learing_rate: %.2f, '
          'max_depth: %d, F1 Score: %.3f' % (best_para[0], best_para[1], best_para[2], fmax))
GBDT(n_trees = [500, 1000], depth = [1,3,5,8], rate = [0.01, 0.1,1], X=X, y=y)
Gradient Boosting Classifier best hyperparameters: n_estimators: 1000, learing_rate: 1.00, max_depth: 8, F1 Score: 0.867
# Plot GBDT ROC and Confusion Matrix
# Till now, the model is fine tuned and best parameter for GBDT was found: n_estimators = 1000, max_depth = 5,
# learning_rate = 0.1. ROC curve looks very good with AUC 0.97.
gbdt = GradientBoostingClassifier(n_estimators = 1000, max_depth = 5, learning_rate = 0.1)
gbdt_clf = BinaryClassification(gbdt)
gbdt_clf.roc_cm(X, y)
# Load the test data from test_data.csv and preview the first rows.
# The preview reports 98 columns versus 99 for the training frame; presumably the
# stabilityVec label is the missing one - confirm before reusing training code paths.
test= pd.read_csv('test_data.csv')
test.head()
# Optional sanity checks (dtypes / missing values), left disabled:
#test.info()
#test.isnull().sum()
| formulaA | formulaB | formulaA_elements_AtomicVolume | formulaB_elements_AtomicVolume | formulaA_elements_AtomicWeight | formulaB_elements_AtomicWeight | formulaA_elements_BoilingT | formulaB_elements_BoilingT | formulaA_elements_BulkModulus | formulaB_elements_BulkModulus | ... | formulaA_elements_Row | formulaB_elements_Row | formulaA_elements_ShearModulus | formulaB_elements_ShearModulus | formulaA_elements_SpaceGroupNumber | formulaB_elements_SpaceGroupNumber | avg_coordination_A | avg_coordination_B | avg_nearest_neighbor_distance_A | avg_nearest_neighbor_distance_B | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Ac | Be | 37.433086 | 8.098176 | 227.0 | 9.012182 | 3473.0 | 2743.0 | 0.0 | 130.0 | ... | 7 | 2 | 0.0 | 132.0 | 225 | 194 | 12.0 | 12.0 | 3.99462 | 2.20087 |
| 1 | Ac | Cd | 37.433086 | 21.580025 | 227.0 | 112.411000 | 3473.0 | 1040.0 | 0.0 | 42.0 | ... | 7 | 5 | 0.0 | 19.0 | 225 | 194 | 12.0 | 6.0 | 3.99462 | 3.00785 |
| 2 | Ac | Cs | 37.433086 | 117.456016 | 227.0 | 132.905452 | 3473.0 | 944.0 | 0.0 | 1.6 | ... | 7 | 6 | 0.0 | 0.0 | 225 | 229 | 12.0 | 8.0 | 3.99462 | 5.32395 |
| 3 | Ac | Ho | 37.433086 | 31.140380 | 227.0 | 164.930320 | 3473.0 | 2973.0 | 0.0 | 40.0 | ... | 7 | 6 | 0.0 | 26.0 | 225 | 194 | 12.0 | 12.0 | 3.99462 | 3.48112 |
| 4 | Ac | K | 37.433086 | 75.847865 | 227.0 | 39.098300 | 3473.0 | 1032.0 | 0.0 | 3.1 | ... | 7 | 4 | 0.0 | 1.3 | 225 | 229 | 12.0 | 8.0 | 3.99462 | 4.57083 |
5 rows × 98 columns
# Build a pandas-profiling report for the test frame; the rendered output lists
# per-column statistics plus warnings about zeros, cardinality and high correlations.
pandas_profiling.ProfileReport(test)
Dataset info
| Number of variables | 98 |
|---|---|
| Number of observations | 749 |
| Total Missing (%) | 0.0% |
| Total size in memory | 573.5 KiB |
| Average record size in memory | 784.1 B |
Variables types
| Numeric | 68 |
|---|---|
| Categorical | 2 |
| Boolean | 14 |
| Date | 0 |
| Text (Unique) | 0 |
| Rejected | 14 |
| Unsupported | 0 |
Warnings
formulaA has a high cardinality: 82 distinct values WarningformulaA_elements_BulkModulus has 133 / 17.8% zeros ZerosformulaA_elements_ElectronSurfaceDensityWS has 130 / 17.4% zeros ZerosformulaA_elements_Electronegativity has 28 / 3.7% zeros ZerosformulaA_elements_GSbandgap is highly correlated with formulaA_elements_AtomicVolume (ρ = 0.90555) RejectedformulaA_elements_GSestFCClatcnt is highly correlated with formulaA_elements_GSestBCClatcnt (ρ = 1) RejectedformulaA_elements_GSmagmom has 694 / 92.7% zeros ZerosformulaA_elements_GSvolume_pa is highly correlated with formulaA_elements_GSestFCClatcnt (ρ = 0.95989) RejectedformulaA_elements_HHIp has 94 / 12.6% zeros ZerosformulaA_elements_HHIr has 94 / 12.6% zeros ZerosformulaA_elements_HeatCapacityMass has 38 / 5.1% zeros ZerosformulaA_elements_HeatCapacityMolar has 38 / 5.1% zeros ZerosformulaA_elements_HeatFusion has 23 / 3.1% zeros ZerosformulaA_elements_ICSDVolume is highly correlated with formulaA_elements_GSvolume_pa (ρ = 0.91837) RejectedformulaA_elements_MeltingT has 9 / 1.2% zeros ZerosformulaA_elements_MendeleevNumber is highly correlated with formulaA_elements_Column (ρ = 0.93584) RejectedformulaA_elements_MiracleRadius has 78 / 10.4% zeros ZerosformulaA_elements_NUnfilled has 152 / 20.3% zeros ZerosformulaA_elements_NdUnfilled has 503 / 67.2% zeros ZerosformulaA_elements_NdValence has 286 / 38.2% zeros ZerosformulaA_elements_NfUnfilled has 599 / 80.0% zeros ZerosformulaA_elements_NfValence has 489 / 65.3% zeros ZerosformulaA_elements_NpUnfilled has 576 / 76.9% zeros ZerosformulaA_elements_NpValence has 534 / 71.3% zeros ZerosformulaA_elements_NsValence has 11 / 1.5% zeros ZerosformulaA_elements_Number is highly correlated with formulaA_elements_AtomicWeight (ρ = 0.99873) RejectedformulaA_elements_Row is highly correlated with formulaA_elements_Number (ρ = 0.95123) RejectedformulaA_elements_ShearModulus has 205 / 27.4% zeros ZerosformulaB has a high cardinality: 82 distinct values 
WarningformulaB_elements_BulkModulus has 146 / 19.5% zeros ZerosformulaB_elements_ElectronSurfaceDensityWS has 115 / 15.4% zeros ZerosformulaB_elements_Electronegativity has 28 / 3.7% zeros ZerosformulaB_elements_GSbandgap is highly correlated with formulaB_elements_AtomicVolume (ρ = 0.91711) RejectedformulaB_elements_GSestFCClatcnt is highly correlated with formulaB_elements_GSestBCClatcnt (ρ = 1) RejectedformulaB_elements_GSmagmom has 674 / 90.0% zeros ZerosformulaB_elements_GSvolume_pa is highly correlated with formulaB_elements_GSestFCClatcnt (ρ = 0.9581) RejectedformulaB_elements_HHIp has 93 / 12.4% zeros ZerosformulaB_elements_HHIr has 93 / 12.4% zeros ZerosformulaB_elements_HeatCapacityMass has 39 / 5.2% zeros ZerosformulaB_elements_HeatCapacityMolar has 39 / 5.2% zeros ZerosformulaB_elements_HeatFusion has 14 / 1.9% zeros ZerosformulaB_elements_ICSDVolume is highly correlated with formulaB_elements_GSestFCClatcnt (ρ = 0.92375) RejectedformulaB_elements_MendeleevNumber is highly correlated with formulaB_elements_Column (ρ = 0.93005) RejectedformulaB_elements_MiracleRadius has 75 / 10.0% zeros ZerosformulaB_elements_NUnfilled has 132 / 17.6% zeros ZerosformulaB_elements_NdUnfilled has 469 / 62.6% zeros ZerosformulaB_elements_NdValence has 256 / 34.2% zeros ZerosformulaB_elements_NfUnfilled has 633 / 84.5% zeros ZerosformulaB_elements_NfValence has 519 / 69.3% zeros ZerosformulaB_elements_NpUnfilled has 576 / 76.9% zeros ZerosformulaB_elements_NpValence has 532 / 71.0% zeros ZerosformulaB_elements_Number is highly correlated with formulaB_elements_AtomicWeight (ρ = 0.99874) RejectedformulaB_elements_Row is highly correlated with formulaB_elements_Number (ρ = 0.94875) RejectedformulaB_elements_ShearModulus has 226 / 30.2% zeros Zerosavg_coordination_A
Numeric
| Distinct count | 10 |
|---|---|
| Unique (%) | 1.3% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 9.1755 |
|---|---|
| Minimum | 1 |
| Maximum | 12 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 1 |
|---|---|
| 5-th percentile | 2 |
| Q1 | 8 |
| Median | 12 |
| Q3 | 12 |
| 95-th percentile | 12 |
| Maximum | 12 |
| Range | 11 |
| Interquartile range | 4 |
Descriptive statistics
| Standard deviation | 3.6532 |
|---|---|
| Coef of variation | 0.39815 |
| Kurtosis | -0.73353 |
| Mean | 9.1755 |
| MAD | 3.228 |
| Skewness | -0.86629 |
| Sum | 6872.4 |
| Variance | 13.346 |
| Memory size | 5.9 KiB |
| Value | Count | Frequency (%) | |
| 12.0 | 428 | 57.1% |
|
| 8.0 | 137 | 18.3% |
|
| 4.0 | 46 | 6.1% |
|
| 3.0 | 43 | 5.7% |
|
| 2.0 | 34 | 4.5% |
|
| 6.0 | 23 | 3.1% |
|
| 1.0 | 18 | 2.4% |
|
| 5.0 | 10 | 1.3% |
|
| 5.24138 | 6 | 0.8% |
|
| 5.5 | 4 | 0.5% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 1.0 | 18 | 2.4% |
|
| 2.0 | 34 | 4.5% |
|
| 3.0 | 43 | 5.7% |
|
| 4.0 | 46 | 6.1% |
|
| 5.0 | 10 | 1.3% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 5.24138 | 6 | 0.8% |
|
| 5.5 | 4 | 0.5% |
|
| 6.0 | 23 | 3.1% |
|
| 8.0 | 137 | 18.3% |
|
| 12.0 | 428 | 57.1% |
|
avg_coordination_B
Numeric
| Distinct count | 10 |
|---|---|
| Unique (%) | 1.3% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 9.1788 |
|---|---|
| Minimum | 1 |
| Maximum | 12 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 1 |
|---|---|
| 5-th percentile | 2 |
| Q1 | 8 |
| Median | 12 |
| Q3 | 12 |
| 95-th percentile | 12 |
| Maximum | 12 |
| Range | 11 |
| Interquartile range | 4 |
Descriptive statistics
| Standard deviation | 3.6135 |
|---|---|
| Coef of variation | 0.39368 |
| Kurtosis | -0.74977 |
| Mean | 9.1788 |
| MAD | 3.2016 |
| Skewness | -0.8475 |
| Sum | 6874.9 |
| Variance | 13.058 |
| Memory size | 5.9 KiB |
| Value | Count | Frequency (%) | |
| 12.0 | 425 | 56.7% |
|
| 8.0 | 139 | 18.6% |
|
| 4.0 | 52 | 6.9% |
|
| 3.0 | 42 | 5.6% |
|
| 2.0 | 25 | 3.3% |
|
| 1.0 | 19 | 2.5% |
|
| 6.0 | 19 | 2.5% |
|
| 5.0 | 12 | 1.6% |
|
| 5.24138 | 8 | 1.1% |
|
| 5.5 | 8 | 1.1% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 1.0 | 19 | 2.5% |
|
| 2.0 | 25 | 3.3% |
|
| 3.0 | 42 | 5.6% |
|
| 4.0 | 52 | 6.9% |
|
| 5.0 | 12 | 1.6% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 5.24138 | 8 | 1.1% |
|
| 5.5 | 8 | 1.1% |
|
| 6.0 | 19 | 2.5% |
|
| 8.0 | 139 | 18.6% |
|
| 12.0 | 425 | 56.7% |
|
avg_nearest_neighbor_distance_A
Numeric
| Distinct count | 82 |
|---|---|
| Unique (%) | 10.9% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 3.1848 |
|---|---|
| Minimum | 1.4241 |
| Maximum | 5.3239 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 1.4241 |
|---|---|
| 5-th percentile | 2.2762 |
| Q1 | 2.7024 |
| Median | 3.0078 |
| Q3 | 3.586 |
| 95-th percentile | 4.5937 |
| Maximum | 5.3239 |
| Range | 3.8999 |
| Interquartile range | 0.88366 |
Descriptive statistics
| Standard deviation | 0.71389 |
|---|---|
| Coef of variation | 0.22416 |
| Kurtosis | 0.50629 |
| Mean | 3.1848 |
| MAD | 0.57383 |
| Skewness | 0.66147 |
| Sum | 2385.4 |
| Variance | 0.50963 |
| Memory size | 5.9 KiB |
| Value | Count | Frequency (%) | |
| 2.96771 | 15 | 2.0% |
|
| 4.89764 | 14 | 1.9% |
|
| 3.46096 | 14 | 1.9% |
|
| 2.62673 | 14 | 1.9% |
|
| 3.6390800000000003 | 14 | 1.9% |
|
| 2.70452 | 13 | 1.7% |
|
| 2.364 | 13 | 1.7% |
|
| 3.70191 | 13 | 1.7% |
|
| 4.5936900000000005 | 13 | 1.7% |
|
| 3.1122099999999997 | 13 | 1.7% |
|
| Other values (72) | 613 | 81.8% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 1.4240899999999999 | 8 | 1.1% |
|
| 1.7253599999999998 | 4 | 0.5% |
|
| 2.06242 | 11 | 1.5% |
|
| 2.20087 | 8 | 1.1% |
|
| 2.2137700000000002 | 2 | 0.3% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 4.57083 | 8 | 1.1% |
|
| 4.5936900000000005 | 13 | 1.7% |
|
| 4.85032 | 10 | 1.3% |
|
| 4.89764 | 14 | 1.9% |
|
| 5.32395 | 9 | 1.2% |
|
avg_nearest_neighbor_distance_B
Numeric
| Distinct count | 82 |
|---|---|
| Unique (%) | 10.9% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 3.1281 |
|---|---|
| Minimum | 1.4241 |
| Maximum | 5.3239 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 1.4241 |
|---|---|
| 5-th percentile | 2.2762 |
| Q1 | 2.6267 |
| Median | 2.9419 |
| Q3 | 3.5339 |
| 95-th percentile | 4.5937 |
| Maximum | 5.3239 |
| Range | 3.8999 |
| Interquartile range | 0.90718 |
Descriptive statistics
| Standard deviation | 0.71954 |
|---|---|
| Coef of variation | 0.23002 |
| Kurtosis | 0.93497 |
| Mean | 3.1281 |
| MAD | 0.5683 |
| Skewness | 0.96195 |
| Sum | 2342.9 |
| Variance | 0.51773 |
| Memory size | 5.9 KiB |
| Value | Count | Frequency (%) | |
| 2.72536 | 17 | 2.3% |
|
| 5.32395 | 16 | 2.1% |
|
| 2.70452 | 14 | 1.9% |
|
| 2.66567 | 14 | 1.9% |
|
| 4.5936900000000005 | 14 | 1.9% |
|
| 2.94195 | 14 | 1.9% |
|
| 2.364 | 13 | 1.7% |
|
| 2.87565 | 13 | 1.7% |
|
| 2.87792 | 13 | 1.7% |
|
| 2.5571900000000003 | 13 | 1.7% |
|
| Other values (72) | 608 | 81.2% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 1.4240899999999999 | 3 | 0.4% |
|
| 1.7253599999999998 | 8 | 1.1% |
|
| 2.06242 | 8 | 1.1% |
|
| 2.20087 | 7 | 0.9% |
|
| 2.2137700000000002 | 11 | 1.5% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 4.57083 | 8 | 1.1% |
|
| 4.5936900000000005 | 14 | 1.9% |
|
| 4.85032 | 9 | 1.2% |
|
| 4.89764 | 7 | 0.9% |
|
| 5.32395 | 16 | 2.1% |
|
formulaA
Categorical
| Distinct count | 82 |
|---|---|
| Unique (%) | 10.9% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Li |
|
|---|---|
| Pm |
|
| Rb |
|
| Other values (79) |
706
|
| Value | Count | Frequency (%) | |
| Li | 15 | 2.0% |
|
| Pm | 14 | 1.9% |
|
| Rb | 14 | 1.9% |
|
| Er | 14 | 1.9% |
|
| Zn | 14 | 1.9% |
|
| Ti | 13 | 1.7% |
|
| Bi | 13 | 1.7% |
|
| Pr | 13 | 1.7% |
|
| Kr | 13 | 1.7% |
|
| Se | 13 | 1.7% |
|
| Other values (72) | 613 | 81.8% |
|
formulaA_elements_AtomicVolume
Numeric
| Distinct count | 82 |
|---|---|
| Unique (%) | 10.9% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 2555.8 |
|---|---|
| Minimum | 7.2978 |
| Maximum | 37236 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 7.2978 |
|---|---|
| 5-th percentile | 11.798 |
| Q1 | 17.076 |
| Median | 27.209 |
| Q3 | 35.239 |
| 95-th percentile | 37107 |
| Maximum | 37236 |
| Range | 37229 |
| Interquartile range | 18.164 |
Descriptive statistics
| Standard deviation | 9353.8 |
|---|---|
| Coef of variation | 3.6599 |
| Kurtosis | 9.8332 |
| Mean | 2555.8 |
| MAD | 4709.3 |
| Skewness | 3.4361 |
| Sum | 1914300 |
| Variance | 87494000 |
| Memory size | 5.9 KiB |
| Value | Count | Frequency (%) | |
| 21.54405808 | 15 | 2.0% |
|
| 15.20568373 | 14 | 1.9% |
|
| 30.63606837 | 14 | 1.9% |
|
| 33.14754856 | 14 | 1.9% |
|
| 92.64095185 | 14 | 1.9% |
|
| 35.48345908 | 13 | 1.7% |
|
| 17.63631716 | 13 | 1.7% |
|
| 35.23917573 | 13 | 1.7% |
|
| 27.20880463 | 13 | 1.7% |
|
| 37107.494739999995 | 13 | 1.7% |
|
| Other values (72) | 613 | 81.8% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 7.297767265 | 4 | 0.5% |
|
| 8.098176455 | 8 | 1.1% |
|
| 8.825089715 | 8 | 1.1% |
|
| 10.94128444 | 5 | 0.7% |
|
| 10.99586068 | 5 | 0.7% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 36952.92402 | 10 | 1.3% |
|
| 37107.494739999995 | 13 | 1.7% |
|
| 37184.28542 | 10 | 1.3% |
|
| 37232.18569 | 9 | 1.2% |
|
| 37236.035560000004 | 9 | 1.2% |
|
formulaA_elements_AtomicWeight
Numeric
| Distinct count | 82 |
|---|---|
| Unique (%) | 10.9% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 112.55 |
|---|---|
| Minimum | 4.0026 |
| Maximum | 238.03 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 4.0026 |
|---|---|
| 5-th percentile | 12.011 |
| Q1 | 58.693 |
| Median | 112.41 |
| Q3 | 162.5 |
| 95-th percentile | 227 |
| Maximum | 238.03 |
| Range | 234.03 |
| Interquartile range | 103.81 |
Descriptive statistics
| Standard deviation | 64.594 |
|---|---|
| Coef of variation | 0.57392 |
| Kurtosis | -0.98954 |
| Mean | 112.55 |
| MAD | 55.055 |
| Skewness | 0.14007 |
| Sum | 84300 |
| Variance | 4172.4 |
| Memory size | 5.9 KiB |
| Value | Count | Frequency (%) | |
| 6.941 | 15 | 2.0% |
|
| 145.0 | 14 | 1.9% |
|
| 65.38 | 14 | 1.9% |
|
| 167.25900000000001 | 14 | 1.9% |
|
| 85.4678 | 14 | 1.9% |
|
| 208.9804 | 13 | 1.7% |
|
| 83.79799999999999 | 13 | 1.7% |
|
| 47.867 | 13 | 1.7% |
|
| 140.90765 | 13 | 1.7% |
|
| 78.96 | 13 | 1.7% |
|
| Other values (72) | 613 | 81.8% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 4.0026019999999995 | 9 | 1.2% |
|
| 6.941 | 15 | 2.0% |
|
| 9.012182000000001 | 8 | 1.1% |
|
| 10.811 | 4 | 0.5% |
|
| 12.0107 | 8 | 1.1% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 227.0 | 9 | 1.2% |
|
| 231.03586 | 8 | 1.1% |
|
| 232.03806 | 10 | 1.3% |
|
| 237.0 | 10 | 1.3% |
|
| 238.02891 | 9 | 1.2% |
|
formulaA_elements_BoilingT
Numeric
| Distinct count | 79 |
|---|---|
| Unique (%) | 10.5% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 2590.9 |
|---|---|
| Minimum | 4.07 |
| Maximum | 5869 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 4.07 |
|---|---|
| 5-th percentile | 119.78 |
| Q1 | 1261 |
| Median | 2840 |
| Q3 | 3563 |
| 95-th percentile | 5093 |
| Maximum | 5869 |
| Range | 5864.9 |
| Interquartile range | 2302 |
Descriptive statistics
| Standard deviation | 1474.9 |
|---|---|
| Coef of variation | 0.56927 |
| Kurtosis | -0.81657 |
| Mean | 2590.9 |
| MAD | 1260.5 |
| Skewness | 0.062445 |
| Sum | 1940600 |
| Variance | 2175400 |
| Memory size | 5.9 KiB |
| Value | Count | Frequency (%) | |
| 4273.0 | 22 | 2.9% |
|
| 1615.0 | 15 | 2.0% |
|
| 3200.0 | 15 | 2.0% |
|
| 1180.0 | 14 | 1.9% |
|
| 961.0 | 14 | 1.9% |
|
| 3141.0 | 14 | 1.9% |
|
| 3273.0 | 14 | 1.9% |
|
| 1837.0 | 13 | 1.7% |
|
| 119.78 | 13 | 1.7% |
|
| 958.0 | 13 | 1.7% |
|
| Other values (69) | 602 | 80.4% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 4.07 | 9 | 1.2% |
|
| 26.92 | 9 | 1.2% |
|
| 87.2 | 10 | 1.3% |
|
| 119.78 | 13 | 1.7% |
|
| 165.0 | 10 | 1.3% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 5093.0 | 10 | 1.3% |
|
| 5285.0 | 9 | 1.2% |
|
| 5731.0 | 11 | 1.5% |
|
| 5828.0 | 4 | 0.5% |
|
| 5869.0 | 5 | 0.7% |
|
formulaA_elements_BulkModulus
Numeric
| Distinct count | 49 |
|---|---|
| Unique (%) | 6.5% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 66.526 |
|---|---|
| Minimum | 0 |
| Maximum | 380 |
| Zeros (%) | 17.8% |
Quantile statistics
| Minimum | 0 |
|---|---|
| 5-th percentile | 0 |
| Q1 | 7.7 |
| Median | 38 |
| Q3 | 76 |
| 95-th percentile | 230 |
| Maximum | 380 |
| Range | 380 |
| Interquartile range | 68.3 |
Descriptive statistics
| Standard deviation | 85.971 |
|---|---|
| Coef of variation | 1.2923 |
| Kurtosis | 3.0767 |
| Mean | 66.526 |
| MAD | 63.957 |
| Skewness | 1.845 |
| Sum | 49828 |
| Variance | 7391 |
| Memory size | 5.9 KiB |
| Value | Count | Frequency (%) | |
| 0.0 | 133 | 17.8% |
|
| 8.3 | 25 | 3.3% |
|
| 31.0 | 23 | 3.1% |
|
| 7.7 | 23 | 3.1% |
|
| 100.0 | 23 | 3.1% |
|
| 33.0 | 22 | 2.9% |
|
| 38.0 | 21 | 2.8% |
|
| 180.0 | 21 | 2.8% |
|
| 42.0 | 21 | 2.8% |
|
| 45.0 | 20 | 2.7% |
|
| Other values (39) | 417 | 55.7% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0.0 | 133 | 17.8% |
|
| 1.6 | 9 | 1.2% |
|
| 1.9 | 6 | 0.8% |
|
| 2.5 | 14 | 1.9% |
|
| 3.1 | 8 | 1.1% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 230.0 | 19 | 2.5% |
|
| 310.0 | 4 | 0.5% |
|
| 320.0 | 9 | 1.2% |
|
| 370.0 | 5 | 0.7% |
|
| 380.0 | 12 | 1.6% |
|
formulaA_elements_Column
Numeric
| Distinct count | 18 |
|---|---|
| Unique (%) | 2.4% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 8.016 |
|---|---|
| Minimum | 1 |
| Maximum | 18 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 1 |
|---|---|
| 5-th percentile | 1 |
| Q1 | 3 |
| Median | 6 |
| Q3 | 13 |
| 95-th percentile | 18 |
| Maximum | 18 |
| Range | 17 |
| Interquartile range | 10 |
Descriptive statistics
| Standard deviation | 5.691 |
|---|---|
| Coef of variation | 0.70995 |
| Kurtosis | -1.374 |
| Mean | 8.016 |
| MAD | 5.1609 |
| Skewness | 0.38255 |
| Sum | 6004 |
| Variance | 32.387 |
| Memory size | 5.9 KiB |
| Value | Count | Frequency (%) | |
| 3 | 211 | 28.2% |
|
| 1 | 56 | 7.5% |
|
| 18 | 51 | 6.8% |
|
| 2 | 46 | 6.1% |
|
| 14 | 44 | 5.9% |
|
| 13 | 42 | 5.6% |
|
| 15 | 35 | 4.7% |
|
| 12 | 34 | 4.5% |
|
| 16 | 34 | 4.5% |
|
| 8 | 28 | 3.7% |
|
| Other values (8) | 168 | 22.4% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 1 | 56 | 7.5% |
|
| 2 | 46 | 6.1% |
|
| 3 | 211 | 28.2% |
|
| 4 | 18 | 2.4% |
|
| 5 | 24 | 3.2% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 14 | 44 | 5.9% |
|
| 15 | 35 | 4.7% |
|
| 16 | 34 | 4.5% |
|
| 17 | 18 | 2.4% |
|
| 18 | 51 | 6.8% |
|
formulaA_elements_CovalentRadius
Numeric
| Distinct count | 57 |
|---|---|
| Unique (%) | 7.6% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 155.35 |
|---|---|
| Minimum | 28 |
| Maximum | 244 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 28 |
|---|---|
| 5-th percentile | 99.6 |
| Q1 | 132 |
| Median | 146 |
| Q3 | 192 |
| 95-th percentile | 215 |
| Maximum | 244 |
| Range | 216 |
| Interquartile range | 60 |
Descriptive statistics
| Standard deviation | 39.239 |
|---|---|
| Coef of variation | 0.25258 |
| Kurtosis | 0.25017 |
| Mean | 155.35 |
| MAD | 32.512 |
| Skewness | -0.29976 |
| Sum | 116360 |
| Variance | 1539.7 |
| Memory size | 5.9 KiB |
| Value | Count | Frequency (%) | |
| 139 | 60 | 8.0% |
|
| 132 | 29 | 3.9% |
|
| 120 | 28 | 3.7% |
|
| 190 | 25 | 3.3% |
|
| 122 | 24 | 3.2% |
|
| 198 | 22 | 2.9% |
|
| 142 | 21 | 2.8% |
|
| 203 | 21 | 2.8% |
|
| 196 | 20 | 2.7% |
|
| 187 | 20 | 2.7% |
|
| Other values (47) | 479 | 64.0% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 28 | 9 | 1.2% |
|
| 58 | 9 | 1.2% |
|
| 76 | 8 | 1.1% |
|
| 84 | 4 | 0.5% |
|
| 96 | 8 | 1.1% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 206 | 10 | 1.3% |
|
| 207 | 7 | 0.9% |
|
| 215 | 18 | 2.4% |
|
| 220 | 14 | 1.9% |
|
| 244 | 9 | 1.2% |
|
formulaA_elements_Density
Numeric
| Distinct count | 80 |
|---|---|
| Unique (%) | 10.7% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 7417.9 |
|---|---|
| Minimum | 0.1785 |
| Maximum | 22590 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 0.1785 |
|---|---|
| 5-th percentile | 3.75 |
| Q1 | 2700 |
| Median | 7010 |
| Q3 | 9780 |
| 95-th percentile | 20450 |
| Maximum | 22590 |
| Range | 22590 |
| Interquartile range | 7080 |
Descriptive statistics
| Standard deviation | 5470.8 |
|---|---|
| Coef of variation | 0.73751 |
| Kurtosis | 0.61084 |
| Mean | 7417.9 |
| MAD | 4116.9 |
| Skewness | 0.92494 |
| Sum | 5556000 |
| Variance | 29930000 |
| Memory size | 5.9 KiB |
| Value | Count | Frequency (%) | |
| 7140.0 | 23 | 3.1% |
|
| 7310.0 | 19 | 2.5% |
|
| 535.0 | 15 | 2.0% |
|
| 1532.0 | 14 | 1.9% |
|
| 9066.0 | 14 | 1.9% |
|
| 7264.0 | 14 | 1.9% |
|
| 6640.0 | 13 | 1.7% |
|
| 3.75 | 13 | 1.7% |
|
| 9780.0 | 13 | 1.7% |
|
| 4819.0 | 13 | 1.7% |
|
| Other values (70) | 598 | 79.8% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0.1785 | 9 | 1.2% |
|
| 0.9 | 9 | 1.2% |
|
| 1.784 | 10 | 1.3% |
|
| 3.75 | 13 | 1.7% |
|
| 5.9 | 10 | 1.3% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 20450.0 | 10 | 1.3% |
|
| 21020.0 | 5 | 0.7% |
|
| 21090.0 | 11 | 1.5% |
|
| 22560.0 | 5 | 0.7% |
|
| 22590.0 | 9 | 1.2% |
|
formulaA_elements_ElectronSurfaceDensityWS
Numeric
| Distinct count | 50 |
|---|---|
| Unique (%) | 6.7% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 1.0894 |
|---|---|
| Minimum | 0 |
| Maximum | 1.85 |
| Zeros (%) | 17.4% |
Quantile statistics
| Minimum | 0 |
|---|---|
| 5-th percentile | 0 |
| Q1 | 0.88 |
| Median | 1.22 |
| Q3 | 1.51 |
| 95-th percentile | 1.81 |
| Maximum | 1.85 |
| Range | 1.85 |
| Interquartile range | 0.63 |
Descriptive statistics
| Standard deviation | 0.57963 |
|---|---|
| Coef of variation | 0.53207 |
| Kurtosis | -0.40378 |
| Mean | 1.0894 |
| MAD | 0.45777 |
| Skewness | -0.83105 |
| Sum | 815.94 |
| Variance | 0.33597 |
| Memory size | 5.9 KiB |
| Value | Count | Frequency (%) | |
| 0.0 | 130 | 17.4% |
|
| 1.21 | 42 | 5.6% |
|
| 1.24 | 40 | 5.3% |
|
| 1.77 | 24 | 3.2% |
|
| 1.2 | 22 | 2.9% |
|
| 1.23 | 22 | 2.9% |
|
| 1.17 | 21 | 2.8% |
|
| 1.22 | 21 | 2.8% |
|
| 1.67 | 19 | 2.5% |
|
| 1.83 | 16 | 2.1% |
|
| Other values (40) | 392 | 52.3% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0.0 | 130 | 17.4% |
|
| 0.55 | 9 | 1.2% |
|
| 0.6 | 14 | 1.9% |
|
| 0.65 | 8 | 1.1% |
|
| 0.81 | 9 | 1.2% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 1.77 | 24 | 3.2% |
|
| 1.78 | 11 | 1.5% |
|
| 1.81 | 10 | 1.3% |
|
| 1.83 | 16 | 2.1% |
|
| 1.85 | 14 | 1.9% |
|
formulaA_elements_Electronegativity
Numeric
| Distinct count | 67 |
|---|---|
| Unique (%) | 8.9% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 1.606 |
|---|---|
| Minimum | 0 |
| Maximum | 3 |
| Zeros (%) | 3.7% |
Quantile statistics
| Minimum | 0 |
|---|---|
| 5-th percentile | 0.82 |
| Q1 | 1.185 |
| Median | 1.57 |
| Q3 | 2.05 |
| 95-th percentile | 2.6 |
| Maximum | 3 |
| Range | 3 |
| Interquartile range | 0.865 |
Descriptive statistics
| Standard deviation | 0.62669 |
|---|---|
| Coef of variation | 0.39022 |
| Kurtosis | 0.095818 |
| Mean | 1.606 |
| MAD | 0.50864 |
| Skewness | -0.13221 |
| Sum | 1202.9 |
| Variance | 0.39274 |
| Memory size | 5.9 KiB |
| Value | Count | Frequency (%) | |
| 2.2 | 36 | 4.8% |
|
| 1.9 | 29 | 3.9% |
|
| 0.0 | 28 | 3.7% |
|
| 2.28 | 23 | 3.1% |
|
| 0.82 | 22 | 2.9% |
|
| 2.55 | 21 | 2.8% |
|
| 1.5 | 19 | 2.5% |
|
| 1.36 | 18 | 2.4% |
|
| 1.1 | 16 | 2.1% |
|
| 0.98 | 15 | 2.0% |
|
| Other values (57) | 522 | 69.7% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0.0 | 28 | 3.7% |
|
| 0.79 | 9 | 1.2% |
|
| 0.82 | 22 | 2.9% |
|
| 0.89 | 9 | 1.2% |
|
| 0.93 | 10 | 1.3% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 2.58 | 11 | 1.5% |
|
| 2.6 | 10 | 1.3% |
|
| 2.66 | 12 | 1.6% |
|
| 2.96 | 6 | 0.8% |
|
| 3.0 | 13 | 1.7% |
|
formulaA_elements_FirstIonizationEnergy
Numeric
| Distinct count | 82 |
|---|---|
| Unique (%) | 10.9% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 7.7812 |
|---|---|
| Minimum | 3.8939 |
| Maximum | 24.587 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 3.8939 |
|---|---|
| 5-th percentile | 5.1391 |
| Q1 | 5.89 |
| Median | 7.0924 |
| Q3 | 8.6084 |
| 95-th percentile | 14 |
| Maximum | 24.587 |
| Range | 20.693 |
| Interquartile range | 2.7184 |
Descriptive statistics
| Standard deviation | 3.2519 |
|---|---|
| Coef of variation | 0.41792 |
| Kurtosis | 10.514 |
| Mean | 7.7812 |
| MAD | 2.0894 |
| Skewness | 2.8448 |
| Sum | 5828.1 |
| Variance | 10.575 |
| Memory size | 5.9 KiB |
| Value | Count | Frequency (%) | |
| 5.391719 | 15 | 2.0% |
|
| 5.582000000000001 | 14 | 1.9% |
|
| 4.177128 | 14 | 1.9% |
|
| 6.1077 | 14 | 1.9% |
|
| 9.394199 | 14 | 1.9% |
|
| 9.75239 | 13 | 1.7% |
|
| 6.828119999999999 | 13 | 1.7% |
|
| 5.473 | 13 | 1.7% |
|
| 7.2855 | 13 | 1.7% |
|
| 13.99961 | 13 | 1.7% |
|
| Other values (72) | 613 | 81.8% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 3.893905 | 9 | 1.2% |
|
| 4.177128 | 14 | 1.9% |
|
| 4.3406633 | 8 | 1.1% |
|
| 5.139076 | 10 | 1.3% |
|
| 5.17 | 9 | 1.2% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 12.12984 | 10 | 1.3% |
|
| 13.99961 | 13 | 1.7% |
|
| 15.75961 | 10 | 1.3% |
|
| 21.56454 | 9 | 1.2% |
|
| 24.587387 | 9 | 1.2% |
|
formulaA_elements_GSbandgap
Highly correlated
This variable is highly correlated with formulaA_elements_AtomicVolume and should be ignored for analysis
| Correlation | 0.90555 |
|---|
formulaA_elements_GSenergy_pa
Numeric
| Distinct count | 82 |
|---|---|
| Unique (%) | 10.9% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | -4.6282 |
|---|---|
| Minimum | -12.959 |
| Maximum | 0.094036 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | -12.959 |
|---|---|
| 5-th percentile | -11.292 |
| Q1 | -6.2855 |
| Median | -4.4739 |
| Q3 | -1.8699 |
| 95-th percentile | 0.048731 |
| Maximum | 0.094036 |
| Range | 13.053 |
| Interquartile range | 4.4156 |
Descriptive statistics
| Standard deviation | 3.3166 |
|---|---|
| Coef of variation | -0.71661 |
| Kurtosis | -0.22431 |
| Mean | -4.6282 |
| MAD | 2.5565 |
| Skewness | -0.69865 |
| Sum | -3466.5 |
| Variance | 11 |
| Memory size | 5.9 KiB |
| Value | Count | Frequency (%) | |
| -1.86988691 | 15 | 2.0% |
|
| -4.744621343 | 14 | 1.9% |
|
| -0.963335025 | 14 | 1.9% |
|
| -1.239999915 | 14 | 1.9% |
|
| -4.563633745 | 14 | 1.9% |
|
| -4.77382996 | 13 | 1.7% |
|
| -3.9736943999999994 | 13 | 1.7% |
|
| 0.08753524 | 13 | 1.7% |
|
| -7.77522712 | 13 | 1.7% |
|
| -3.47079191 | 13 | 1.7% |
|
| Other values (72) | 613 | 81.8% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| -12.95880062 | 4 | 0.5% |
|
| -12.94031436 | 10 | 1.3% |
|
| -12.41702777 | 5 | 0.7% |
|
| -11.85139954 | 11 | 1.5% |
|
| -11.29233996 | 9 | 1.2% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 0.0011354000000000002 | 9 | 1.2% |
|
| 0.04873128 | 10 | 1.3% |
|
| 0.05854104 | 10 | 1.3% |
|
| 0.08753524 | 13 | 1.7% |
|
| 0.09403567 | 9 | 1.2% |
|
formulaA_elements_GSestBCClatcnt
Numeric
| Distinct count | 82 |
|---|---|
| Unique (%) | 10.9% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 3.7003 |
|---|---|
| Minimum | 2.2427 |
| Maximum | 6.1405 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 2.2427 |
|---|---|
| 5-th percentile | 2.7789 |
| Q1 | 3.1965 |
| Median | 3.7223 |
| Q3 | 4.0528 |
| 95-th percentile | 5.0289 |
| Maximum | 6.1405 |
| Range | 3.8978 |
| Interquartile range | 0.85636 |
Descriptive statistics
| Standard deviation | 0.68661 |
|---|---|
| Coef of variation | 0.18556 |
| Kurtosis | 1.5868 |
| Mean | 3.7003 |
| MAD | 0.53084 |
| Skewness | 0.84922 |
| Sum | 2771.5 |
| Variance | 0.47143 |
| Memory size | 5.9 KiB |
| Value | Count | Frequency (%) | |
| 3.2135708380000003 | 15 | 2.0% |
|
| 4.075342789 | 14 | 1.9% |
|
| 3.940150647 | 14 | 1.9% |
|
| 3.0336942239999996 | 14 | 1.9% |
|
| 5.661284779 | 14 | 1.9% |
|
| 4.039197959 | 13 | 1.7% |
|
| 4.14761076 | 13 | 1.7% |
|
| 3.2197991210000003 | 13 | 1.7% |
|
| 4.162477568 | 13 | 1.7% |
|
| 3.7286790069999998 | 13 | 1.7% |
|
| Other values (72) | 613 | 81.8% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 2.242692341 | 8 | 1.1% |
|
| 2.429779429 | 4 | 0.5% |
|
| 2.508239481 | 8 | 1.1% |
|
| 2.7364067480000003 | 5 | 0.7% |
|
| 2.74306794 | 5 | 0.7% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 4.768954732 | 7 | 0.9% |
|
| 5.028899311 | 9 | 1.2% |
|
| 5.268200874 | 8 | 1.1% |
|
| 5.661284779 | 14 | 1.9% |
|
| 6.140481453 | 9 | 1.2% |
|
formulaA_elements_GSestFCClatcnt
Highly correlated
This variable is highly correlated with formulaA_elements_GSestBCClatcnt and should be ignored for analysis
| Correlation | 1 |
|---|
formulaA_elements_GSmagmom
Numeric
| Distinct count | 8 |
|---|---|
| Unique (%) | 1.1% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 0.036888 |
|---|---|
| Minimum | 0 |
| Maximum | 2.1107 |
| Zeros (%) | 92.7% |
Quantile statistics
| Minimum | 0 |
|---|---|
| 5-th percentile | 0 |
| Q1 | 0 |
| Median | 0 |
| Q3 | 0 |
| 95-th percentile | 2.25e-05 |
| Maximum | 2.1107 |
| Range | 2.1107 |
| Interquartile range | 0 |
Descriptive statistics
| Standard deviation | 0.25432 |
|---|---|
| Coef of variation | 6.8945 |
| Kurtosis | 53.215 |
| Mean | 0.036888 |
| MAD | 0.071938 |
| Skewness | 7.2892 |
| Sum | 27.629 |
| Variance | 0.064681 |
| Memory size | 5.9 KiB |
| Value | Count | Frequency (%) | |
| 0.0 | 694 | 92.7% |
|
| 2.25e-05 | 13 | 1.7% |
|
| 0.0022471 | 10 | 1.3% |
|
| 6.35e-06 | 8 | 1.1% |
|
| 2.1106627999999996 | 8 | 1.1% |
|
| 0.00031 | 6 | 0.8% |
|
| 1.5484712 | 5 | 0.7% |
|
| 0.5953946999999999 | 5 | 0.7% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0.0 | 694 | 92.7% |
|
| 6.35e-06 | 8 | 1.1% |
|
| 2.25e-05 | 13 | 1.7% |
|
| 0.00031 | 6 | 0.8% |
|
| 0.0022471 | 10 | 1.3% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 0.00031 | 6 | 0.8% |
|
| 0.0022471 | 10 | 1.3% |
|
| 0.5953946999999999 | 5 | 0.7% |
|
| 1.5484712 | 5 | 0.7% |
|
| 2.1106627999999996 | 8 | 1.1% |
|
formulaA_elements_GSvolume_pa
Highly correlated
This variable is highly correlated with formulaA_elements_GSestFCClatcnt and should be ignored for analysis
| Correlation | 0.95989 |
|---|
formulaA_elements_HHIp
Numeric
| Distinct count | 34 |
|---|---|
| Unique (%) | 4.5% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 4377.7 |
|---|---|
| Minimum | 0 |
| Maximum | 9800 |
| Zeros (%) | 12.6% |
Quantile statistics
| Minimum | 0 |
|---|---|
| 5-th percentile | 0 |
| Q1 | 1600 |
| Median | 3300 |
| Q3 | 6500 |
| 95-th percentile | 9500 |
| Maximum | 9800 |
| Range | 9800 |
| Interquartile range | 4900 |
Descriptive statistics
| Standard deviation | 3291.5 |
|---|---|
| Coef of variation | 0.75187 |
| Kurtosis | -1.1473 |
| Mean | 4377.7 |
| MAD | 2845.8 |
| Skewness | 0.40212 |
| Sum | 3278900 |
| Variance | 10834000 |
| Memory size | 5.9 KiB |
| Value | Count | Frequency (%) | |
| 9500 | 150 | 20.0% |
|
| 0 | 94 | 12.6% |
|
| 5500 | 54 | 7.2% |
|
| 3200 | 43 | 5.7% |
|
| 1600 | 42 | 5.6% |
|
| 3300 | 35 | 4.7% |
|
| 5300 | 34 | 4.5% |
|
| 2900 | 29 | 3.9% |
|
| 1100 | 28 | 3.7% |
|
| 6000 | 23 | 3.1% |
|
| Other values (24) | 217 | 29.0% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0 | 94 | 12.6% |
|
| 500 | 8 | 1.1% |
|
| 700 | 11 | 1.5% |
|
| 1000 | 5 | 0.7% |
|
| 1100 | 28 | 3.7% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 7900 | 12 | 1.6% |
|
| 8000 | 8 | 1.1% |
|
| 8500 | 6 | 0.8% |
|
| 9500 | 150 | 20.0% |
|
| 9800 | 7 | 0.9% |
|
formulaA_elements_HHIr
Numeric
| Distinct count | 34 |
|---|---|
| Unique (%) | 4.5% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 3196.1 |
|---|---|
| Minimum | 0 |
| Maximum | 9100 |
| Zeros (%) | 12.6% |
Quantile statistics
| Minimum | 0 |
|---|---|
| 5-th percentile | 0 |
| Q1 | 1500 |
| Median | 3100 |
| Q3 | 4300 |
| 95-th percentile | 8000 |
| Maximum | 9100 |
| Range | 9100 |
| Interquartile range | 2800 |
Descriptive statistics
| Standard deviation | 2420.3 |
|---|---|
| Coef of variation | 0.75727 |
| Kurtosis | -0.068353 |
| Mean | 3196.1 |
| MAD | 1857.2 |
| Skewness | 0.74395 |
| Sum | 2393900 |
| Variance | 5858000 |
| Memory size | 5.9 KiB |
| Value | Count | Frequency (%) | |
| 3100 | 161 | 21.5% |
|
| 0 | 94 | 12.6% |
|
| 6000 | 49 | 6.5% |
|
| 1000 | 36 | 4.8% |
|
| 8000 | 34 | 4.5% |
|
| 1900 | 33 | 4.4% |
|
| 500 | 30 | 4.0% |
|
| 9100 | 25 | 3.3% |
|
| 1500 | 25 | 3.3% |
|
| 4800 | 23 | 3.1% |
|
| Other values (24) | 239 | 31.9% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0 | 94 | 12.6% |
|
| 500 | 30 | 4.0% |
|
| 1000 | 36 | 4.8% |
|
| 1300 | 9 | 1.2% |
|
| 1400 | 14 | 1.9% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 6900 | 6 | 0.8% |
|
| 7200 | 8 | 1.1% |
|
| 8000 | 34 | 4.5% |
|
| 8800 | 6 | 0.8% |
|
| 9100 | 25 | 3.3% |
|
formulaA_elements_HeatCapacityMass
Numeric
| Distinct count | 73 |
|---|---|
| Unique (%) | 9.7% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 0.458 |
|---|---|
| Minimum | 0 |
| Maximum | 5.13 |
| Zeros (%) | 5.1% |
Quantile statistics
| Minimum | 0 |
|---|---|
| 5-th percentile | 0.0464 |
| Q1 | 0.16 |
| Median | 0.235 |
| Q3 | 0.449 |
| 95-th percentile | 1.228 |
| Maximum | 5.13 |
| Range | 5.13 |
| Interquartile range | 0.289 |
Descriptive statistics
| Standard deviation | 0.74783 |
|---|---|
| Coef of variation | 1.6328 |
| Kurtosis | 21.79 |
| Mean | 0.458 |
| MAD | 0.37785 |
| Skewness | 4.4666 |
| Sum | 343.04 |
| Variance | 0.55925 |
| Memory size | 5.9 KiB |
| Value | Count | Frequency (%) | |
| 0.0 | 38 | 5.1% |
|
| 0.14 | 22 | 2.9% |
|
| 0.182 | 22 | 2.9% |
|
| 0.13 | 18 | 2.4% |
|
| 0.449 | 17 | 2.3% |
|
| 0.235 | 17 | 2.3% |
|
| 3.582 | 15 | 2.0% |
|
| 0.38799999999999996 | 14 | 1.9% |
|
| 0.168 | 14 | 1.9% |
|
| 0.363 | 14 | 1.9% |
|
| Other values (63) | 558 | 74.5% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0.0 | 38 | 5.1% |
|
| 0.11599999999999999 | 9 | 1.2% |
|
| 0.11800000000000001 | 10 | 1.3% |
|
| 0.12 | 9 | 1.2% |
|
| 0.122 | 13 | 1.7% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 1.03 | 9 | 1.2% |
|
| 1.228 | 10 | 1.3% |
|
| 1.825 | 8 | 1.1% |
|
| 3.582 | 15 | 2.0% |
|
| 5.13 | 9 | 1.2% |
|
formulaA_elements_HeatCapacityMolar
Numeric
| Distinct count | 67 |
|---|---|
| Unique (%) | 8.9% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 25.091 |
|---|---|
| Minimum | 0 |
| Maximum | 75.69 |
| Zeros (%) | 5.1% |
Quantile statistics
| Minimum | 0 |
|---|---|
| 5-th percentile | 3.4068 |
| Q1 | 24.44 |
| Median | 25.52 |
| Q3 | 27.2 |
| 95-th percentile | 31.75 |
| Maximum | 75.69 |
| Range | 75.69 |
| Interquartile range | 2.76 |
Descriptive statistics
| Standard deviation | 8.8572 |
|---|---|
| Coef of variation | 0.35301 |
| Kurtosis | 11.087 |
| Mean | 25.091 |
| MAD | 4.2648 |
| Skewness | 0.87191 |
| Sum | 18793 |
| Variance | 78.45 |
| Memory size | 5.9 KiB |
| Value | Count | Frequency (%) | |
| 20.785999999999998 | 51 | 6.8% |
|
| 0.0 | 38 | 5.1% |
|
| 27.2 | 22 | 2.9% |
|
| 25.52 | 21 | 2.8% |
|
| 24.06 | 19 | 2.5% |
|
| 26.74 | 19 | 2.5% |
|
| 24.7 | 18 | 2.4% |
|
| 25.36 | 16 | 2.1% |
|
| 24.86 | 15 | 2.0% |
|
| 28.12 | 14 | 1.9% |
|
| Other values (57) | 516 | 68.9% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0.0 | 38 | 5.1% |
|
| 8.517000000000001 | 8 | 1.1% |
|
| 11.087 | 4 | 0.5% |
|
| 16.442999999999998 | 8 | 1.1% |
|
| 19.99 | 8 | 1.1% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 31.06 | 14 | 1.9% |
|
| 32.21 | 9 | 1.2% |
|
| 37.03 | 11 | 1.5% |
|
| 54.43 | 12 | 1.6% |
|
| 75.69 | 6 | 0.8% |
|
formulaA_elements_HeatFusion
Numeric
| Distinct count | 79 |
|---|---|
| Unique (%) | 10.5% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 14.306 |
|---|---|
| Minimum | 0 |
| Maximum | 117.4 |
| Zeros (%) | 3.1% |
Quantile statistics
| Minimum | 0 |
|---|---|
| 5-th percentile | 1.164 |
| Q1 | 5.46 |
| Median | 9.21 |
| Q3 | 17.48 |
| 95-th percentile | 40.108 |
| Maximum | 117.4 |
| Range | 117.4 |
| Interquartile range | 12.02 |
Descriptive statistics
| Standard deviation | 16.227 |
|---|---|
| Coef of variation | 1.1343 |
| Kurtosis | 16.137 |
| Mean | 14.306 |
| MAD | 10.413 |
| Skewness | 3.3254 |
| Sum | 10715 |
| Variance | 263.33 |
| Memory size | 5.9 KiB |
| Value | Count | Frequency (%) | |
| 0.0 | 23 | 3.1% |
|
| 13.81 | 18 | 2.4% |
|
| 3.0 | 15 | 2.0% |
|
| 2.19 | 14 | 1.9% |
|
| 19.9 | 14 | 1.9% |
|
| 21.0 | 14 | 1.9% |
|
| 7.068 | 14 | 1.9% |
|
| 14.15 | 13 | 1.7% |
|
| 6.69 | 13 | 1.7% |
|
| 11.106 | 13 | 1.7% |
|
| Other values (69) | 598 | 79.8% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0.0 | 23 | 3.1% |
|
| 0.16399999999999998 | 9 | 1.2% |
|
| 1.1640000000000001 | 13 | 1.7% |
|
| 1.18 | 10 | 1.3% |
|
| 1.7209999999999999 | 11 | 1.5% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 50.2 | 4 | 0.5% |
|
| 50.21 | 8 | 1.1% |
|
| 52.31 | 4 | 0.5% |
|
| 57.85 | 9 | 1.2% |
|
| 117.4 | 8 | 1.1% |
|
formulaA_elements_ICSDVolume
Highly correlated
This variable is highly correlated with formulaA_elements_GSvolume_pa and should be ignored for analysis
| Correlation | 0.91837 |
|---|
formulaA_elements_IsAlkali
Boolean
| Distinct count | 2 |
|---|---|
| Unique (%) | 0.3% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Mean | 0.13618 |
|---|
| 0 |
647
|
|---|---|
| 1 |
|
| Value | Count | Frequency (%) | |
| 0 | 647 | 86.4% |
|
| 1 | 102 | 13.6% |
|
formulaA_elements_IsDBlock
Boolean
| Distinct count | 2 |
|---|---|
| Unique (%) | 0.3% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Mean | 0.31642 |
|---|
| 0 |
512
|
|---|---|
| 1 |
237
|
| Value | Count | Frequency (%) | |
| 0 | 512 | 68.4% |
|
| 1 | 237 | 31.6% |
|
formulaA_elements_IsFBlock
Boolean
| Distinct count | 2 |
|---|---|
| Unique (%) | 0.3% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Mean | 0.26168 |
|---|
| 0 |
553
|
|---|---|
| 1 |
196
|
| Value | Count | Frequency (%) | |
| 0 | 553 | 73.8% |
|
| 1 | 196 | 26.2% |
|
formulaA_elements_IsMetal
Boolean
| Distinct count | 2 |
|---|---|
| Unique (%) | 0.3% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Mean | 0.82109 |
|---|
| 1 |
615
|
|---|---|
| 0 |
134
|
| Value | Count | Frequency (%) | |
| 1 | 615 | 82.1% |
|
| 0 | 134 | 17.9% |
|
formulaA_elements_IsMetalloid
Boolean
| Distinct count | 2 |
|---|---|
| Unique (%) | 0.3% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Mean | 0.068091 |
|---|
| 0 |
698
|
|---|---|
| 1 |
|
| Value | Count | Frequency (%) | |
| 0 | 698 | 93.2% |
|
| 1 | 51 | 6.8% |
|
formulaA_elements_IsNonmetal
Boolean
| Distinct count | 2 |
|---|---|
| Unique (%) | 0.3% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Mean | 0.13752 |
|---|
| 0 |
646
|
|---|---|
| 1 |
|
| Value | Count | Frequency (%) | |
| 0 | 646 | 86.2% |
|
| 1 | 103 | 13.8% |
|
formulaA_elements_MeltingT
Numeric
| Distinct count | 82 |
|---|---|
| Unique (%) | 10.9% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 1277 |
|---|---|
| Minimum | 0 |
| Maximum | 3823 |
| Zeros (%) | 1.2% |
Quantile statistics
| Minimum | 0 |
|---|---|
| 5-th percentile | 115.79 |
| Q1 | 505.08 |
| Median | 1204 |
| Q3 | 1814 |
| 95-th percentile | 2896 |
| Maximum | 3823 |
| Range | 3823 |
| Interquartile range | 1308.9 |
Descriptive statistics
| Standard deviation | 863.93 |
|---|---|
| Coef of variation | 0.67654 |
| Kurtosis | 0.21264 |
| Mean | 1277 |
| MAD | 694.38 |
| Skewness | 0.73838 |
| Sum | 956460 |
| Variance | 746380 |
| Memory size | 5.9 KiB |
| Value | Count | Frequency (%) | |
| 453.69 | 15 | 2.0% |
|
| 692.68 | 14 | 1.9% |
|
| 1373.0 | 14 | 1.9% |
|
| 1770.0 | 14 | 1.9% |
|
| 312.46 | 14 | 1.9% |
|
| 115.79 | 13 | 1.7% |
|
| 494.0 | 13 | 1.7% |
|
| 1941.0 | 13 | 1.7% |
|
| 544.4 | 13 | 1.7% |
|
| 1204.0 | 13 | 1.7% |
|
| Other values (72) | 613 | 81.8% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0.0 | 9 | 1.2% |
|
| 24.56 | 9 | 1.2% |
|
| 83.8 | 10 | 1.3% |
|
| 115.79 | 13 | 1.7% |
|
| 161.3 | 10 | 1.3% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 3290.0 | 11 | 1.5% |
|
| 3306.0 | 9 | 1.2% |
|
| 3459.0 | 5 | 0.7% |
|
| 3695.0 | 4 | 0.5% |
|
| 3823.0 | 8 | 1.1% |
|
formulaA_elements_MendeleevNumber
Highly correlated
This variable is highly correlated with formulaA_elements_Column and should be ignored for analysis
| Correlation | 0.93584 |
|---|
formulaA_elements_MiracleRadius
Numeric
| Distinct count | 52 |
|---|---|
| Unique (%) | 6.9% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 141.8 |
|---|---|
| Minimum | 0 |
| Maximum | 264 |
| Zeros (%) | 10.4% |
Quantile statistics
| Minimum | 0 |
|---|---|
| 5-th percentile | 0 |
| Q1 | 132 |
| Median | 152 |
| Q3 | 176 |
| 95-th percentile | 223 |
| Maximum | 264 |
| Range | 264 |
| Interquartile range | 44 |
Descriptive statistics
| Standard deviation | 57.736 |
|---|---|
| Coef of variation | 0.40717 |
| Kurtosis | 1.4758 |
| Mean | 141.8 |
| MAD | 39.532 |
| Skewness | -1.1692 |
| Sum | 106206 |
| Variance | 3333.4 |
| Memory size | 5.9 KiB |
| Value | Count | Frequency (%) | |
| 0 | 78 | 10.4% |
|
| 175 | 47 | 6.3% |
|
| 155 | 31 | 4.1% |
|
| 134 | 28 | 3.7% |
|
| 152 | 26 | 3.5% |
|
| 140 | 24 | 3.2% |
|
| 185 | 24 | 3.2% |
|
| 142 | 24 | 3.2% |
|
| 162 | 21 | 2.8% |
|
| 176 | 21 | 2.8% |
|
| Other values (42) | 425 | 56.7% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0 | 78 | 10.4% |
|
| 77 | 8 | 1.1% |
|
| 88 | 4 | 0.5% |
|
| 102 | 2 | 0.3% |
|
| 103 | 11 | 1.5% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 212 | 7 | 0.9% |
|
| 223 | 9 | 1.2% |
|
| 230 | 8 | 1.1% |
|
| 244 | 14 | 1.9% |
|
| 264 | 9 | 1.2% |
|
formulaA_elements_NUnfilled
Numeric
| Distinct count | 17 |
|---|---|
| Unique (%) | 2.3% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 4.5915 |
|---|---|
| Minimum | 0 |
| Maximum | 22 |
| Zeros (%) | 20.3% |
Quantile statistics
| Minimum | 0 |
|---|---|
| 5-th percentile | 0 |
| Q1 | 1 |
| Median | 3 |
| Q3 | 7 |
| 95-th percentile | 17.8 |
| Maximum | 22 |
| Range | 22 |
| Interquartile range | 6 |
Descriptive statistics
| Standard deviation | 5.0201 |
|---|---|
| Coef of variation | 1.0934 |
| Kurtosis | 3.0558 |
| Mean | 4.5915 |
| MAD | 3.6835 |
| Skewness | 1.7303 |
| Sum | 3439 |
| Variance | 25.202 |
| Memory size | 5.9 KiB |
| Value | Count | Frequency (%) | |
| 0 | 152 | 20.3% |
|
| 1 | 103 | 13.8% |
|
| 4 | 77 | 10.3% |
|
| 5 | 69 | 9.2% |
|
| 2 | 64 | 8.5% |
|
| 3 | 63 | 8.4% |
|
| 9 | 55 | 7.3% |
|
| 8 | 38 | 5.1% |
|
| 7 | 36 | 4.8% |
|
| 6 | 21 | 2.8% |
|
| Other values (7) | 71 | 9.5% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0 | 152 | 20.3% |
|
| 1 | 103 | 13.8% |
|
| 2 | 64 | 8.5% |
|
| 3 | 63 | 8.4% |
|
| 4 | 77 | 10.3% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 16 | 11 | 1.5% |
|
| 19 | 10 | 1.3% |
|
| 20 | 9 | 1.2% |
|
| 21 | 8 | 1.1% |
|
| 22 | 11 | 1.5% |
|
formulaA_elements_NValance
Numeric
| Distinct count | 29 |
|---|---|
| Unique (%) | 3.9% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 10.143 |
|---|---|
| Minimum | 1 |
| Maximum | 29 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 1 |
|---|---|
| 5-th percentile | 1 |
| Q1 | 4 |
| Median | 8 |
| Q3 | 15 |
| 95-th percentile | 26 |
| Maximum | 29 |
| Range | 28 |
| Interquartile range | 11 |
Descriptive statistics
| Standard deviation | 7.303 |
|---|---|
| Coef of variation | 0.72001 |
| Kurtosis | -0.14631 |
| Mean | 10.143 |
| MAD | 6.0404 |
| Skewness | 0.77608 |
| Sum | 7597 |
| Variance | 53.334 |
| Memory size | 5.9 KiB |
| Value | Count | Frequency (%) | |
| 1 | 56 | 7.5% |
|
| 2 | 55 | 7.3% |
|
| 4 | 55 | 7.3% |
|
| 8 | 48 | 6.4% |
|
| 3 | 47 | 6.3% |
|
| 6 | 46 | 6.1% |
|
| 5 | 36 | 4.8% |
|
| 7 | 36 | 4.8% |
|
| 16 | 33 | 4.4% |
|
| 14 | 33 | 4.4% |
|
| Other values (19) | 304 | 40.6% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 1 | 56 | 7.5% |
|
| 2 | 55 | 7.3% |
|
| 3 | 47 | 6.3% |
|
| 4 | 55 | 7.3% |
|
| 5 | 36 | 4.8% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 25 | 5 | 0.7% |
|
| 26 | 11 | 1.5% |
|
| 27 | 7 | 0.9% |
|
| 28 | 9 | 1.2% |
|
| 29 | 13 | 1.7% |
|
formulaA_elements_NdUnfilled
Numeric
| Distinct count | 10 |
|---|---|
| Unique (%) | 1.3% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 2.0908 |
|---|---|
| Minimum | 0 |
| Maximum | 9 |
| Zeros (%) | 67.2% |
Quantile statistics
| Minimum | 0 |
|---|---|
| 5-th percentile | 0 |
| Q1 | 0 |
| Median | 0 |
| Q3 | 4 |
| 95-th percentile | 9 |
| Maximum | 9 |
| Range | 9 |
| Interquartile range | 4 |
Descriptive statistics
| Standard deviation | 3.359 |
|---|---|
| Coef of variation | 1.6066 |
| Kurtosis | -0.21396 |
| Mean | 2.0908 |
| MAD | 2.8443 |
| Skewness | 1.2219 |
| Sum | 1566 |
| Variance | 11.283 |
| Memory size | 5.9 KiB |
| Value | Count | Frequency (%) | |
| 0 | 503 | 67.2% |
|
| 9 | 90 | 12.0% |
|
| 5 | 34 | 4.5% |
|
| 8 | 28 | 3.7% |
|
| 3 | 21 | 2.8% |
|
| 7 | 18 | 2.4% |
|
| 4 | 17 | 2.3% |
|
| 2 | 17 | 2.3% |
|
| 1 | 11 | 1.5% |
|
| 6 | 10 | 1.3% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0 | 503 | 67.2% |
|
| 1 | 11 | 1.5% |
|
| 2 | 17 | 2.3% |
|
| 3 | 21 | 2.8% |
|
| 4 | 17 | 2.3% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 5 | 34 | 4.5% |
|
| 6 | 10 | 1.3% |
|
| 7 | 18 | 2.4% |
|
| 8 | 28 | 3.7% |
|
| 9 | 90 | 12.0% |
|
formulaA_elements_NdValence
Numeric
| Distinct count | 11 |
|---|---|
| Unique (%) | 1.5% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 4.0908 |
|---|---|
| Minimum | 0 |
| Maximum | 10 |
| Zeros (%) | 38.2% |
Quantile statistics
| Minimum | 0 |
|---|---|
| 5-th percentile | 0 |
| Q1 | 0 |
| Median | 1 |
| Q3 | 10 |
| 95-th percentile | 10 |
| Maximum | 10 |
| Range | 10 |
| Interquartile range | 10 |
Descriptive statistics
| Standard deviation | 4.3509 |
|---|---|
| Coef of variation | 1.0636 |
| Kurtosis | -1.6373 |
| Mean | 4.0908 |
| MAD | 4.078 |
| Skewness | 0.41077 |
| Sum | 3064 |
| Variance | 18.93 |
| Memory size | 5.9 KiB |
| Value | Count | Frequency (%) | |
| 0 | 286 | 38.2% |
|
| 10 | 217 | 29.0% |
|
| 1 | 90 | 12.0% |
|
| 5 | 34 | 4.5% |
|
| 2 | 28 | 3.7% |
|
| 7 | 21 | 2.8% |
|
| 3 | 18 | 2.4% |
|
| 8 | 17 | 2.3% |
|
| 6 | 17 | 2.3% |
|
| 9 | 11 | 1.5% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0 | 286 | 38.2% |
|
| 1 | 90 | 12.0% |
|
| 2 | 28 | 3.7% |
|
| 3 | 18 | 2.4% |
|
| 4 | 10 | 1.3% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 6 | 17 | 2.3% |
|
| 7 | 21 | 2.8% |
|
| 8 | 17 | 2.3% |
|
| 9 | 11 | 1.5% |
|
| 10 | 217 | 29.0% |
|
formulaA_elements_NfUnfilled
Numeric
| Distinct count | 13 |
|---|---|
| Unique (%) | 1.7% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 1.5514 |
|---|---|
| Minimum | 0 |
| Maximum | 13 |
| Zeros (%) | 80.0% |
Quantile statistics
| Minimum | 0 |
|---|---|
| 5-th percentile | 0 |
| Q1 | 0 |
| Median | 0 |
| Q3 | 0 |
| 95-th percentile | 11 |
| Maximum | 13 |
| Range | 13 |
| Interquartile range | 0 |
Descriptive statistics
| Standard deviation | 3.4916 |
|---|---|
| Coef of variation | 2.2506 |
| Kurtosis | 2.811 |
| Mean | 1.5514 |
| MAD | 2.4932 |
| Skewness | 2.0815 |
| Sum | 1162 |
| Variance | 12.192 |
| Memory size | 5.9 KiB |
| Value | Count | Frequency (%) | |
| 0 | 599 | 80.0% |
|
| 7 | 23 | 3.1% |
|
| 11 | 22 | 2.9% |
|
| 10 | 19 | 2.5% |
|
| 9 | 14 | 1.9% |
|
| 2 | 14 | 1.9% |
|
| 13 | 11 | 1.5% |
|
| 8 | 10 | 1.3% |
|
| 5 | 10 | 1.3% |
|
| 12 | 8 | 1.1% |
|
| Other values (3) | 19 | 2.5% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0 | 599 | 80.0% |
|
| 1 | 8 | 1.1% |
|
| 2 | 14 | 1.9% |
|
| 3 | 6 | 0.8% |
|
| 4 | 5 | 0.7% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 9 | 14 | 1.9% |
|
| 10 | 19 | 2.5% |
|
| 11 | 22 | 2.9% |
|
| 12 | 8 | 1.1% |
|
| 13 | 11 | 1.5% |
|
formulaA_elements_NfValence
Numeric
| Distinct count | 14 |
|---|---|
| Unique (%) | 1.9% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 3.3084 |
|---|---|
| Minimum | 0 |
| Maximum | 14 |
| Zeros (%) | 65.3% |
Quantile statistics
| Minimum | 0 |
|---|---|
| 5-th percentile | 0 |
| Q1 | 0 |
| Median | 0 |
| Q3 | 5 |
| 95-th percentile | 14 |
| Maximum | 14 |
| Range | 14 |
| Interquartile range | 5 |
Descriptive statistics
| Standard deviation | 5.3172 |
|---|---|
| Coef of variation | 1.6072 |
| Kurtosis | -0.15738 |
| Mean | 3.3084 |
| MAD | 4.4338 |
| Skewness | 1.2533 |
| Sum | 2478 |
| Variance | 28.272 |
| Memory size | 5.9 KiB |
| Value | Count | Frequency (%) | |
| 0 | 489 | 65.3% |
|
| 14 | 110 | 14.7% |
|
| 7 | 23 | 3.1% |
|
| 3 | 22 | 2.9% |
|
| 4 | 19 | 2.5% |
|
| 12 | 14 | 1.9% |
|
| 5 | 14 | 1.9% |
|
| 1 | 11 | 1.5% |
|
| 9 | 10 | 1.3% |
|
| 6 | 10 | 1.3% |
|
| Other values (4) | 27 | 3.6% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0 | 489 | 65.3% |
|
| 1 | 11 | 1.5% |
|
| 2 | 8 | 1.1% |
|
| 3 | 22 | 2.9% |
|
| 4 | 19 | 2.5% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 10 | 5 | 0.7% |
|
| 11 | 6 | 0.8% |
|
| 12 | 14 | 1.9% |
|
| 13 | 8 | 1.1% |
|
| 14 | 110 | 14.7% |
|
formulaA_elements_NpUnfilled
Numeric
| Distinct count | 6 |
|---|---|
| Unique (%) | 0.8% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 0.77036 |
|---|---|
| Minimum | 0 |
| Maximum | 5 |
| Zeros (%) | 76.9% |
Quantile statistics
| Minimum | 0 |
|---|---|
| 5-th percentile | 0 |
| Q1 | 0 |
| Median | 0 |
| Q3 | 0 |
| 95-th percentile | 5 |
| Maximum | 5 |
| Range | 5 |
| Interquartile range | 0 |
Descriptive statistics
| Standard deviation | 1.542 |
|---|---|
| Coef of variation | 2.0016 |
| Kurtosis | 1.6065 |
| Mean | 0.77036 |
| MAD | 1.1849 |
| Skewness | 1.7726 |
| Sum | 577 |
| Variance | 2.3777 |
| Memory size | 5.9 KiB |
| Value | Count | Frequency (%) | |
| 0 | 576 | 76.9% |
|
| 4 | 44 | 5.9% |
|
| 5 | 42 | 5.6% |
|
| 3 | 35 | 4.7% |
|
| 2 | 34 | 4.5% |
|
| 1 | 18 | 2.4% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0 | 576 | 76.9% |
|
| 1 | 18 | 2.4% |
|
| 2 | 34 | 4.5% |
|
| 3 | 35 | 4.7% |
|
| 4 | 44 | 5.9% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 1 | 18 | 2.4% |
|
| 2 | 34 | 4.5% |
|
| 3 | 35 | 4.7% |
|
| 4 | 44 | 5.9% |
|
| 5 | 42 | 5.6% |
|
formulaA_elements_NpValence
Numeric
| Distinct count | 7 |
|---|---|
| Unique (%) | 0.9% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 0.95194 |
|---|---|
| Minimum | 0 |
| Maximum | 6 |
| Zeros (%) | 71.3% |
Quantile statistics
| Minimum | 0 |
|---|---|
| 5-th percentile | 0 |
| Q1 | 0 |
| Median | 0 |
| Q3 | 1 |
| 95-th percentile | 6 |
| Maximum | 6 |
| Range | 6 |
| Interquartile range | 1 |
Descriptive statistics
| Standard deviation | 1.7764 |
|---|---|
| Coef of variation | 1.866 |
| Kurtosis | 1.8808 |
| Mean | 0.95194 |
| MAD | 1.3574 |
| Skewness | 1.7818 |
| Sum | 713 |
| Variance | 3.1554 |
| Memory size | 5.9 KiB |
| Value | Count | Frequency (%) | |
| 0 | 534 | 71.3% |
|
| 2 | 44 | 5.9% |
|
| 6 | 42 | 5.6% |
|
| 1 | 42 | 5.6% |
|
| 3 | 35 | 4.7% |
|
| 4 | 34 | 4.5% |
|
| 5 | 18 | 2.4% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0 | 534 | 71.3% |
|
| 1 | 42 | 5.6% |
|
| 2 | 44 | 5.9% |
|
| 3 | 35 | 4.7% |
|
| 4 | 34 | 4.5% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 2 | 44 | 5.9% |
|
| 3 | 35 | 4.7% |
|
| 4 | 34 | 4.5% |
|
| 5 | 18 | 2.4% |
|
| 6 | 42 | 5.6% |
|
formulaA_elements_NsUnfilled
Boolean
| Distinct count | 2 |
|---|---|
| Unique (%) | 0.3% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Mean | 0.17891 |
|---|
| 0 |
615
|
|---|---|
| 1 |
134
|
| Value | Count | Frequency (%) | |
| 0 | 615 | 82.1% |
|
| 1 | 134 | 17.9% |
|
formulaA_elements_NsValence
Numeric
| Distinct count | 3 |
|---|---|
| Unique (%) | 0.4% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 1.7917 |
|---|---|
| Minimum | 0 |
| Maximum | 2 |
| Zeros (%) | 1.5% |
Quantile statistics
| Minimum | 0 |
|---|---|
| 5-th percentile | 1 |
| Q1 | 2 |
| Median | 2 |
| Q3 | 2 |
| 95-th percentile | 2 |
| Maximum | 2 |
| Range | 2 |
| Interquartile range | 0 |
Descriptive statistics
| Standard deviation | 0.44106 |
|---|---|
| Coef of variation | 0.24616 |
| Kurtosis | 2.9406 |
| Mean | 1.7917 |
| MAD | 0.33591 |
| Skewness | -1.9422 |
| Sum | 1342 |
| Variance | 0.19453 |
| Memory size | 5.9 KiB |
| Value | Count | Frequency (%) | |
| 2 | 604 | 80.6% |
|
| 1 | 134 | 17.9% |
|
| 0 | 11 | 1.5% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0 | 11 | 1.5% |
|
| 1 | 134 | 17.9% |
|
| 2 | 604 | 80.6% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 0 | 11 | 1.5% |
|
| 1 | 134 | 17.9% |
|
| 2 | 604 | 80.6% |
|
formulaA_elements_Number
Highly correlated
This variable is highly correlated with formulaA_elements_AtomicWeight and should be ignored for analysis
| Correlation | 0.99873 |
|---|
formulaA_elements_Polarizability
Numeric
| Distinct count | 79 |
|---|---|
| Unique (%) | 10.5% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 15.246 |
|---|---|
| Minimum | -5.8167 |
| Maximum | 59.42 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | -5.8167 |
|---|---|
| 5-th percentile | 0.4 |
| Q1 | 5.6 |
| Median | 9.7 |
| Q3 | 24.8 |
| 95-th percentile | 39.7 |
| Maximum | 59.42 |
| Range | 65.237 |
| Interquartile range | 19.2 |
Descriptive statistics
| Standard deviation | 12.635 |
|---|---|
| Coef of variation | 0.82876 |
| Kurtosis | 0.70952 |
| Mean | 15.246 |
| MAD | 10.706 |
| Skewness | 0.93972 |
| Sum | 11419 |
| Variance | 159.65 |
| Memory size | 5.9 KiB |
| Value | Count | Frequency (%) | |
| 22.7 | 21 | 2.8% |
|
| 32.1 | 19 | 2.5% |
|
| 6.8 | 17 | 2.3% |
|
| 24.335 | 15 | 2.0% |
|
| 30.1 | 14 | 1.9% |
|
| 47.27 | 14 | 1.9% |
|
| -5.816666667000001 | 14 | 1.9% |
|
| 28.2 | 13 | 1.7% |
|
| 0.4 | 13 | 1.7% |
|
| 3.77 | 13 | 1.7% |
|
| Other values (69) | 596 | 79.6% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| -5.816666667000001 | 14 | 1.9% |
|
| 0.2050522 | 9 | 1.2% |
|
| 0.39432 | 9 | 1.2% |
|
| 0.4 | 13 | 1.7% |
|
| 1.6411 | 10 | 1.3% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 32.1 | 19 | 2.5% |
|
| 39.7 | 9 | 1.2% |
|
| 43.23 | 8 | 1.1% |
|
| 47.27 | 14 | 1.9% |
|
| 59.42 | 9 | 1.2% |
|
formulaA_elements_Row
Highly correlated
This variable is highly correlated with formulaA_elements_Number and should be ignored for analysis
| Correlation | 0.95123 |
|---|
formulaA_elements_ShearModulus
Numeric
| Distinct count | 49 |
|---|---|
| Unique (%) | 6.5% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 31.297 |
|---|---|
| Minimum | 0 |
| Maximum | 222 |
| Zeros (%) | 27.4% |
Quantile statistics
| Minimum | 0 |
|---|---|
| 5-th percentile | 0 |
| Q1 | 0 |
| Median | 16 |
| Q3 | 31 |
| 95-th percentile | 150 |
| Maximum | 222 |
| Range | 222 |
| Interquartile range | 31 |
Descriptive statistics
| Standard deviation | 46.988 |
|---|---|
| Coef of variation | 1.5014 |
| Kurtosis | 5.0899 |
| Mean | 31.297 |
| MAD | 31.47 |
| Skewness | 2.321 |
| Sum | 23441 |
| Variance | 2207.9 |
| Memory size | 5.9 KiB |
| Value | Count | Frequency (%) | |
| 0.0 | 205 | 27.4% |
|
| 20.0 | 30 | 4.0% |
|
| 26.0 | 25 | 3.3% |
|
| 44.0 | 24 | 3.2% |
|
| 18.0 | 24 | 3.2% |
|
| 22.0 | 21 | 2.8% |
|
| 16.0 | 19 | 2.5% |
|
| 31.0 | 18 | 2.4% |
|
| 14.0 | 18 | 2.4% |
|
| 4.2 | 15 | 2.0% |
|
| Other values (39) | 350 | 46.7% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0.0 | 205 | 27.4% |
|
| 1.3 | 8 | 1.1% |
|
| 2.8 | 7 | 0.9% |
|
| 3.3 | 10 | 1.3% |
|
| 3.7 | 13 | 1.7% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 161.0 | 4 | 0.5% |
|
| 173.0 | 11 | 1.5% |
|
| 178.0 | 5 | 0.7% |
|
| 210.0 | 5 | 0.7% |
|
| 222.0 | 9 | 1.2% |
|
formulaA_elements_SpaceGroupNumber
Numeric
| Distinct count | 16 |
|---|---|
| Unique (%) | 2.1% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 187.55 |
|---|---|
| Minimum | 2 |
| Maximum | 229 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 2 |
|---|---|
| 5-th percentile | 62.4 |
| Q1 | 194 |
| Median | 194 |
| Q3 | 225 |
| 95-th percentile | 229 |
| Maximum | 229 |
| Range | 227 |
| Interquartile range | 31 |
Descriptive statistics
| Standard deviation | 56.092 |
|---|---|
| Coef of variation | 0.29908 |
| Kurtosis | 2.5334 |
| Mean | 187.55 |
| MAD | 39.129 |
| Skewness | -1.8433 |
| Sum | 140472 |
| Variance | 3146.3 |
| Memory size | 5.9 KiB |
| Value | Count | Frequency (%) | |
| 194 | 245 | 32.7% |
|
| 225 | 192 | 25.6% |
|
| 229 | 130 | 17.4% |
|
| 166 | 45 | 6.0% |
|
| 64 | 28 | 3.7% |
|
| 139 | 17 | 2.3% |
|
| 14 | 13 | 1.7% |
|
| 12 | 13 | 1.7% |
|
| 70 | 11 | 1.5% |
|
| 152 | 10 | 1.3% |
|
| Other values (6) | 45 | 6.0% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 2 | 2 | 0.3% |
|
| 12 | 13 | 1.7% |
|
| 14 | 13 | 1.7% |
|
| 62 | 10 | 1.3% |
|
| 63 | 9 | 1.2% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 194 | 245 | 32.7% |
|
| 217 | 6 | 0.8% |
|
| 225 | 192 | 25.6% |
|
| 227 | 8 | 1.1% |
|
| 229 | 130 | 17.4% |
|
formulaB
Categorical
| Distinct count | 82 |
|---|---|
| Unique (%) | 10.9% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Tc |
|
|---|---|
| Cs |
|
| Ti |
|
| Other values (79) |
702
|
| Value | Count | Frequency (%) | |
| Tc | 17 | 2.3% |
|
| Cs | 16 | 2.1% |
|
| Ti | 14 | 1.9% |
|
| Ru | 14 | 1.9% |
|
| Ag | 14 | 1.9% |
|
| Kr | 14 | 1.9% |
|
| Eu | 13 | 1.7% |
|
| Cu | 13 | 1.7% |
|
| Mg | 13 | 1.7% |
|
| Sn | 13 | 1.7% |
|
| Other values (72) | 608 | 81.2% |
|
formulaB_elements_AtomicVolume
Numeric
| Distinct count | 82 |
|---|---|
| Unique (%) | 10.9% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 2554.9 |
|---|---|
| Minimum | 7.2978 |
| Maximum | 37236 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 7.2978 |
|---|---|
| 5-th percentile | 10.996 |
| Q1 | 15.859 |
| Median | 24.961 |
| Q3 | 34.785 |
| 95-th percentile | 37107 |
| Maximum | 37236 |
| Range | 37229 |
| Interquartile range | 18.926 |
Descriptive statistics
| Standard deviation | 9354.8 |
|---|---|
| Coef of variation | 3.6615 |
| Kurtosis | 9.8332 |
| Mean | 2554.9 |
| MAD | 4709.8 |
| Skewness | 3.4361 |
| Sum | 1913600 |
| Variance | 87513000 |
| Memory size | 5.9 KiB |
| Value | Count | Frequency (%) | |
| 14.15101151 | 17 | 2.3% |
|
| 117.4560158 | 16 | 2.1% |
|
| 17.63631716 | 14 | 1.9% |
|
| 17.07564795 | 14 | 1.9% |
|
| 13.56787441 | 14 | 1.9% |
|
| 37107.494739999995 | 14 | 1.9% |
|
| 48.12129236 | 13 | 1.7% |
|
| 29.52403191 | 13 | 1.7% |
|
| 18.00213298 | 13 | 1.7% |
|
| 27.20880463 | 13 | 1.7% |
|
| Other values (72) | 608 | 81.2% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 7.297767265 | 8 | 1.1% |
|
| 8.098176455 | 7 | 0.9% |
|
| 8.825089715 | 3 | 0.4% |
|
| 10.94128444 | 12 | 1.6% |
|
| 10.99586068 | 9 | 1.2% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 36952.92402 | 9 | 1.2% |
|
| 37107.494739999995 | 14 | 1.9% |
|
| 37184.28542 | 10 | 1.3% |
|
| 37232.18569 | 11 | 1.5% |
|
| 37236.035560000004 | 7 | 0.9% |
|
formulaB_elements_AtomicWeight
Numeric
| Distinct count | 82 |
|---|---|
| Unique (%) | 10.9% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 107.11 |
|---|---|
| Minimum | 4.0026 |
| Maximum | 238.03 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 4.0026 |
|---|---|
| 5-th percentile | 20.179 |
| Q1 | 55.845 |
| Median | 98 |
| Q3 | 157.25 |
| 95-th percentile | 208.98 |
| Maximum | 238.03 |
| Range | 234.03 |
| Interquartile range | 101.41 |
Descriptive statistics
| Standard deviation | 62.186 |
|---|---|
| Coef of variation | 0.58055 |
| Kurtosis | -0.91974 |
| Mean | 107.11 |
| MAD | 52.75 |
| Skewness | 0.28144 |
| Sum | 80229 |
| Variance | 3867 |
| Memory size | 5.9 KiB |
| Value | Count | Frequency (%) | |
| 98.0 | 17 | 2.3% |
|
| 132.9054519 | 16 | 2.1% |
|
| 101.07 | 14 | 1.9% |
|
| 83.79799999999999 | 14 | 1.9% |
|
| 107.8682 | 14 | 1.9% |
|
| 47.867 | 14 | 1.9% |
|
| 151.964 | 13 | 1.7% |
|
| 24.305 | 13 | 1.7% |
|
| 118.71 | 13 | 1.7% |
|
| 78.96 | 13 | 1.7% |
|
| Other values (72) | 608 | 81.2% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 4.0026019999999995 | 7 | 0.9% |
|
| 6.941 | 11 | 1.5% |
|
| 9.012182000000001 | 7 | 0.9% |
|
| 10.811 | 8 | 1.1% |
|
| 12.0107 | 3 | 0.4% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 227.0 | 5 | 0.7% |
|
| 231.03586 | 7 | 0.9% |
|
| 232.03806 | 4 | 0.5% |
|
| 237.0 | 8 | 1.1% |
|
| 238.02891 | 8 | 1.1% |
|
formulaB_elements_BoilingT
Numeric
| Distinct count | 79 |
|---|---|
| Unique (%) | 10.5% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 2672.5 |
|---|---|
| Minimum | 4.07 |
| Maximum | 5869 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 4.07 |
|---|---|
| 5-th percentile | 119.78 |
| Q1 | 1363 |
| Median | 2875 |
| Q3 | 3675 |
| 95-th percentile | 5285 |
| Maximum | 5869 |
| Range | 5864.9 |
| Interquartile range | 2312 |
Descriptive statistics
| Standard deviation | 1545.5 |
|---|---|
| Coef of variation | 0.5783 |
| Kurtosis | -0.82309 |
| Mean | 2672.5 |
| MAD | 1302.2 |
| Skewness | 0.07295 |
| Sum | 2001700 |
| Variance | 2388600 |
| Memory size | 5.9 KiB |
| Value | Count | Frequency (%) | |
| 4273.0 | 23 | 3.1% |
|
| 3200.0 | 22 | 2.9% |
|
| 4538.0 | 17 | 2.3% |
|
| 944.0 | 16 | 2.1% |
|
| 119.78 | 14 | 1.9% |
|
| 3560.0 | 14 | 1.9% |
|
| 4423.0 | 14 | 1.9% |
|
| 2435.0 | 14 | 1.9% |
|
| 1363.0 | 13 | 1.7% |
|
| 2875.0 | 13 | 1.7% |
|
| Other values (69) | 589 | 78.6% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 4.07 | 7 | 0.9% |
|
| 26.92 | 11 | 1.5% |
|
| 87.2 | 10 | 1.3% |
|
| 119.78 | 14 | 1.9% |
|
| 165.0 | 9 | 1.2% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 5093.0 | 4 | 0.5% |
|
| 5285.0 | 5 | 0.7% |
|
| 5731.0 | 12 | 1.6% |
|
| 5828.0 | 10 | 1.3% |
|
| 5869.0 | 12 | 1.6% |
|
formulaB_elements_BulkModulus
Numeric
| Distinct count | 49 |
|---|---|
| Unique (%) | 6.5% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 74.381 |
|---|---|
| Minimum | 0 |
| Maximum | 380 |
| Zeros (%) | 19.5% |
Quantile statistics
| Minimum | 0 |
|---|---|
| 5-th percentile | 0 |
| Q1 | 6.3 |
| Median | 38.7 |
| Q3 | 110 |
| 95-th percentile | 310 |
| Maximum | 380 |
| Range | 380 |
| Interquartile range | 103.7 |
Descriptive statistics
| Standard deviation | 92.886 |
|---|---|
| Coef of variation | 1.2488 |
| Kurtosis | 1.7239 |
| Mean | 74.381 |
| MAD | 73.04 |
| Skewness | 1.5449 |
| Sum | 55712 |
| Variance | 8627.8 |
| Memory size | 5.9 KiB |
| Value | Count | Frequency (%) | |
| 0.0 | 146 | 19.5% |
|
| 100.0 | 34 | 4.5% |
|
| 8.3 | 26 | 3.5% |
|
| 180.0 | 26 | 3.5% |
|
| 11.0 | 22 | 2.9% |
|
| 45.0 | 22 | 2.9% |
|
| 170.0 | 21 | 2.8% |
|
| 41.0 | 20 | 2.7% |
|
| 220.0 | 20 | 2.7% |
|
| 7.7 | 19 | 2.5% |
|
| Other values (39) | 393 | 52.5% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0.0 | 146 | 19.5% |
|
| 1.6 | 16 | 2.1% |
|
| 1.9 | 8 | 1.1% |
|
| 2.5 | 7 | 0.9% |
|
| 3.1 | 8 | 1.1% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 230.0 | 17 | 2.3% |
|
| 310.0 | 10 | 1.3% |
|
| 320.0 | 14 | 1.9% |
|
| 370.0 | 12 | 1.6% |
|
| 380.0 | 6 | 0.8% |
|
formulaB_elements_Column
Numeric
| Distinct count | 18 |
|---|---|
| Unique (%) | 2.4% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 8.2203 |
|---|---|
| Minimum | 1 |
| Maximum | 18 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 1 |
|---|---|
| 5-th percentile | 1 |
| Q1 | 3 |
| Median | 7 |
| Q3 | 13 |
| 95-th percentile | 18 |
| Maximum | 18 |
| Range | 17 |
| Interquartile range | 10 |
Descriptive statistics
| Standard deviation | 5.5447 |
|---|---|
| Coef of variation | 0.67452 |
| Kurtosis | -1.3287 |
| Mean | 8.2203 |
| MAD | 4.979 |
| Skewness | 0.33929 |
| Sum | 6157 |
| Variance | 30.744 |
| Memory size | 5.9 KiB |
| Value | Count | Frequency (%) | |
| 3 | 175 | 23.4% |
|
| 18 | 51 | 6.8% |
|
| 1 | 51 | 6.8% |
|
| 14 | 48 | 6.4% |
|
| 13 | 42 | 5.6% |
|
| 2 | 42 | 5.6% |
|
| 15 | 39 | 5.2% |
|
| 7 | 37 | 4.9% |
|
| 11 | 33 | 4.4% |
|
| 5 | 32 | 4.3% |
|
| Other values (8) | 199 | 26.6% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 1 | 51 | 6.8% |
|
| 2 | 42 | 5.6% |
|
| 3 | 175 | 23.4% |
|
| 4 | 26 | 3.5% |
|
| 5 | 32 | 4.3% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 14 | 48 | 6.4% |
|
| 15 | 39 | 5.2% |
|
| 16 | 25 | 3.3% |
|
| 17 | 19 | 2.5% |
|
| 18 | 51 | 6.8% |
|
formulaB_elements_CovalentRadius
Numeric
| Distinct count | 57 |
|---|---|
| Unique (%) | 7.6% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 152.72 |
|---|---|
| Minimum | 28 |
| Maximum | 244 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 28 |
|---|---|
| 5-th percentile | 105 |
| Q1 | 128 |
| Median | 146 |
| Q3 | 189 |
| 95-th percentile | 207 |
| Maximum | 244 |
| Range | 216 |
| Interquartile range | 61 |
Descriptive statistics
| Standard deviation | 38.046 |
|---|---|
| Coef of variation | 0.24912 |
| Kurtosis | 0.41437 |
| Mean | 152.72 |
| MAD | 30.532 |
| Skewness | -0.10706 |
| Sum | 114390 |
| Variance | 1447.5 |
| Memory size | 5.9 KiB |
| Value | Count | Frequency (%) | |
| 139 | 51 | 6.8% |
|
| 120 | 32 | 4.3% |
|
| 132 | 30 | 4.0% |
|
| 190 | 29 | 3.9% |
|
| 122 | 24 | 3.2% |
|
| 198 | 24 | 3.2% |
|
| 170 | 23 | 3.1% |
|
| 146 | 23 | 3.1% |
|
| 141 | 19 | 2.5% |
|
| 187 | 19 | 2.5% |
|
| Other values (47) | 475 | 63.4% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 28 | 7 | 0.9% |
|
| 58 | 11 | 1.5% |
|
| 76 | 3 | 0.4% |
|
| 84 | 8 | 1.1% |
|
| 96 | 7 | 0.9% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 206 | 4 | 0.5% |
|
| 207 | 8 | 1.1% |
|
| 215 | 11 | 1.5% |
|
| 220 | 7 | 0.9% |
|
| 244 | 16 | 2.1% |
|
formulaB_elements_Density
Numeric
| Distinct count | 80 |
|---|---|
| Unique (%) | 10.7% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 7497.9 |
|---|---|
| Minimum | 0.1785 |
| Maximum | 22590 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 0.1785 |
|---|---|
| 5-th percentile | 3.75 |
| Q1 | 2985 |
| Median | 7140 |
| Q3 | 9841 |
| 95-th percentile | 20450 |
| Maximum | 22590 |
| Range | 22590 |
| Interquartile range | 6856 |
Descriptive statistics
| Standard deviation | 5556.8 |
|---|---|
| Coef of variation | 0.74111 |
| Kurtosis | 0.4734 |
| Mean | 7497.9 |
| MAD | 4219.4 |
| Skewness | 0.91677 |
| Sum | 5616000 |
| Variance | 30878000 |
| Memory size | 5.9 KiB |
| Value | Count | Frequency (%) | |
| 7310.0 | 24 | 3.2% |
|
| 7140.0 | 21 | 2.8% |
|
| 11500.0 | 17 | 2.3% |
|
| 1879.0 | 16 | 2.1% |
|
| 4507.0 | 14 | 1.9% |
|
| 12370.0 | 14 | 1.9% |
|
| 10490.0 | 14 | 1.9% |
|
| 3.75 | 14 | 1.9% |
|
| 8920.0 | 13 | 1.7% |
|
| 1738.0 | 13 | 1.7% |
|
| Other values (70) | 589 | 78.6% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0.1785 | 7 | 0.9% |
|
| 0.9 | 11 | 1.5% |
|
| 1.784 | 10 | 1.3% |
|
| 3.75 | 14 | 1.9% |
|
| 5.9 | 9 | 1.2% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 20450.0 | 8 | 1.1% |
|
| 21020.0 | 12 | 1.6% |
|
| 21090.0 | 10 | 1.3% |
|
| 22560.0 | 6 | 0.8% |
|
| 22590.0 | 5 | 0.7% |
|
formulaB_elements_ElectronSurfaceDensityWS
Numeric
| Distinct count | 50 |
|---|---|
| Unique (%) | 6.7% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 1.1515 |
|---|---|
| Minimum | 0 |
| Maximum | 1.85 |
| Zeros (%) | 15.4% |
Quantile statistics
| Minimum | 0 |
|---|---|
| 5-th percentile | 0 |
| Q1 | 0.91 |
| Median | 1.24 |
| Q3 | 1.63 |
| 95-th percentile | 1.81 |
| Maximum | 1.85 |
| Range | 1.85 |
| Interquartile range | 0.72 |
Descriptive statistics
| Standard deviation | 0.5774 |
|---|---|
| Coef of variation | 0.50141 |
| Kurtosis | -0.17797 |
| Mean | 1.1515 |
| MAD | 0.44395 |
| Skewness | -0.9463 |
| Sum | 862.51 |
| Variance | 0.33339 |
| Memory size | 5.9 KiB |
| Value | Count | Frequency (%) | |
| 0.0 | 115 | 15.4% |
|
| 1.24 | 42 | 5.6% |
|
| 1.21 | 35 | 4.7% |
|
| 1.75 | 29 | 3.9% |
|
| 1.81 | 27 | 3.6% |
|
| 1.17 | 24 | 3.2% |
|
| 1.22 | 23 | 3.1% |
|
| 1.83 | 20 | 2.7% |
|
| 1.64 | 20 | 2.7% |
|
| 1.77 | 18 | 2.4% |
|
| Other values (40) | 396 | 52.9% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0.0 | 115 | 15.4% |
|
| 0.55 | 16 | 2.1% |
|
| 0.6 | 7 | 0.9% |
|
| 0.65 | 8 | 1.1% |
|
| 0.81 | 6 | 0.8% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 1.77 | 18 | 2.4% |
|
| 1.78 | 10 | 1.3% |
|
| 1.81 | 27 | 3.6% |
|
| 1.83 | 20 | 2.7% |
|
| 1.85 | 17 | 2.3% |
|
formulaB_elements_Electronegativity
Numeric
| Distinct count | 67 |
|---|---|
| Unique (%) | 8.9% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 1.641 |
|---|---|
| Minimum | 0 |
| Maximum | 3 |
| Zeros (%) | 3.7% |
Quantile statistics
| Minimum | 0 |
|---|---|
| 5-th percentile | 0.79 |
| Q1 | 1.22 |
| Median | 1.65 |
| Q3 | 2.02 |
| 95-th percentile | 2.6 |
| Maximum | 3 |
| Range | 3 |
| Interquartile range | 0.8 |
Descriptive statistics
| Standard deviation | 0.61668 |
|---|---|
| Coef of variation | 0.37579 |
| Kurtosis | 0.35255 |
| Mean | 1.641 |
| MAD | 0.49401 |
| Skewness | -0.27564 |
| Sum | 1229.1 |
| Variance | 0.38029 |
| Memory size | 5.9 KiB |
| Value | Count | Frequency (%) | |
| 1.9 | 54 | 7.2% |
|
| 2.2 | 30 | 4.0% |
|
| 0.0 | 28 | 3.7% |
|
| 1.22 | 20 | 2.7% |
|
| 1.5 | 19 | 2.5% |
|
| 1.36 | 19 | 2.5% |
|
| 2.55 | 16 | 2.1% |
|
| 0.79 | 16 | 2.1% |
|
| 2.28 | 16 | 2.1% |
|
| 0.82 | 15 | 2.0% |
|
| Other values (57) | 516 | 68.9% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0.0 | 28 | 3.7% |
|
| 0.79 | 16 | 2.1% |
|
| 0.82 | 15 | 2.0% |
|
| 0.89 | 6 | 0.8% |
|
| 0.93 | 9 | 1.2% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 2.58 | 8 | 1.1% |
|
| 2.6 | 9 | 1.2% |
|
| 2.66 | 11 | 1.5% |
|
| 2.96 | 8 | 1.1% |
|
| 3.0 | 14 | 1.9% |
|
formulaB_elements_FirstIonizationEnergy
Numeric
| Distinct count | 82 |
|---|---|
| Unique (%) | 10.9% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 7.854 |
|---|---|
| Minimum | 3.8939 |
| Maximum | 24.587 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 3.8939 |
|---|---|
| 5-th percentile | 5.1391 |
| Q1 | 5.9993 |
| Median | 7.3439 |
| Q3 | 8.3369 |
| 95-th percentile | 14 |
| Maximum | 24.587 |
| Range | 20.693 |
| Interquartile range | 2.3376 |
Descriptive statistics
| Standard deviation | 3.177 |
|---|---|
| Coef of variation | 0.4045 |
| Kurtosis | 10.424 |
| Mean | 7.854 |
| MAD | 1.9893 |
| Skewness | 2.8249 |
| Sum | 5882.7 |
| Variance | 10.093 |
| Memory size | 5.9 KiB |
| Value | Count | Frequency (%) | |
| 7.28 | 17 | 2.3% |
|
| 3.893905 | 16 | 2.1% |
|
| 13.99961 | 14 | 1.9% |
|
| 7.576230000000001 | 14 | 1.9% |
|
| 6.828119999999999 | 14 | 1.9% |
|
| 7.3605 | 14 | 1.9% |
|
| 5.42586 | 13 | 1.7% |
|
| 7.726380000000001 | 13 | 1.7% |
|
| 5.670380000000001 | 13 | 1.7% |
|
| 7.34392 | 13 | 1.7% |
|
| Other values (72) | 608 | 81.2% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 3.893905 | 16 | 2.1% |
|
| 4.177128 | 7 | 0.9% |
|
| 4.3406633 | 8 | 1.1% |
|
| 5.139076 | 9 | 1.2% |
|
| 5.17 | 5 | 0.7% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 12.12984 | 9 | 1.2% |
|
| 13.99961 | 14 | 1.9% |
|
| 15.75961 | 10 | 1.3% |
|
| 21.56454 | 11 | 1.5% |
|
| 24.587387 | 7 | 0.9% |
|
formulaB_elements_GSbandgap
Highly correlated
This variable is highly correlated with formulaB_elements_AtomicVolume and should be ignored for analysis
| Correlation | 0.91711 |
|---|
formulaB_elements_GSenergy_pa
Numeric
| Distinct count | 82 |
|---|---|
| Unique (%) | 10.9% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | -4.9591 |
|---|---|
| Minimum | -12.959 |
| Maximum | 0.094036 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | -12.959 |
|---|---|
| 5-th percentile | -11.851 |
| Q1 | -7.2536 |
| Median | -4.5774 |
| Q3 | -1.8699 |
| 95-th percentile | 0.048731 |
| Maximum | 0.094036 |
| Range | 13.053 |
| Interquartile range | 5.3838 |
Descriptive statistics
| Standard deviation | 3.4976 |
|---|---|
| Coef of variation | -0.70528 |
| Kurtosis | -0.54379 |
| Mean | -4.9591 |
| MAD | 2.8127 |
| Skewness | -0.57242 |
| Sum | -3714.4 |
| Variance | 12.233 |
| Memory size | 5.9 KiB |
| Value | Count | Frequency (%) | |
| -10.355151199999998 | 17 | 2.3% |
|
| -0.85462646 | 16 | 2.1% |
|
| -9.19463357 | 14 | 1.9% |
|
| -2.76539356 | 14 | 1.9% |
|
| 0.08753524 | 14 | 1.9% |
|
| -7.77522712 | 14 | 1.9% |
|
| -10.09332153 | 13 | 1.7% |
|
| -1.82958684 | 13 | 1.7% |
|
| -3.68065482 | 13 | 1.7% |
|
| -3.47079191 | 13 | 1.7% |
|
| Other values (72) | 608 | 81.2% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| -12.95880062 | 10 | 1.3% |
|
| -12.94031436 | 8 | 1.1% |
|
| -12.41702777 | 12 | 1.6% |
|
| -11.85139954 | 12 | 1.6% |
|
| -11.29233996 | 8 | 1.1% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 0.0011354000000000002 | 7 | 0.9% |
|
| 0.04873128 | 9 | 1.2% |
|
| 0.05854104 | 10 | 1.3% |
|
| 0.08753524 | 14 | 1.9% |
|
| 0.09403567 | 11 | 1.5% |
|
formulaB_elements_GSestBCClatcnt
Numeric
| Distinct count | 82 |
|---|---|
| Unique (%) | 10.9% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 3.641 |
|---|---|
| Minimum | 2.2427 |
| Maximum | 6.1405 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 2.2427 |
|---|---|
| 5-th percentile | 2.7431 |
| Q1 | 3.1086 |
| Median | 3.5773 |
| Q3 | 4.0154 |
| 95-th percentile | 4.769 |
| Maximum | 6.1405 |
| Range | 3.8978 |
| Interquartile range | 0.90674 |
Descriptive statistics
| Standard deviation | 0.69586 |
|---|---|
| Coef of variation | 0.19112 |
| Kurtosis | 2.4635 |
| Mean | 3.641 |
| MAD | 0.52744 |
| Skewness | 1.1916 |
| Sum | 2727.1 |
| Variance | 0.48422 |
| Memory size | 5.9 KiB |
| Value | Count | Frequency (%) | |
| 3.057056135 | 17 | 2.3% |
|
| 6.140481453 | 16 | 2.1% |
|
| 4.162477568 | 14 | 1.9% |
|
| 3.2197991210000003 | 14 | 1.9% |
|
| 3.1964805060000003 | 14 | 1.9% |
|
| 3.000740558 | 14 | 1.9% |
|
| 3.7286790069999998 | 13 | 1.7% |
|
| 4.177811939 | 13 | 1.7% |
|
| 4.052840562 | 13 | 1.7% |
|
| 3.8648607630000003 | 13 | 1.7% |
|
| Other values (72) | 608 | 81.2% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 2.242692341 | 3 | 0.4% |
|
| 2.429779429 | 8 | 1.1% |
|
| 2.508239481 | 7 | 0.9% |
|
| 2.7364067480000003 | 9 | 1.2% |
|
| 2.74306794 | 12 | 1.6% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 4.768954732 | 7 | 0.9% |
|
| 5.028899311 | 6 | 0.8% |
|
| 5.268200874 | 8 | 1.1% |
|
| 5.661284779 | 7 | 0.9% |
|
| 6.140481453 | 16 | 2.1% |
|
formulaB_elements_GSestFCClatcnt
Highly correlated
This variable is highly correlated with formulaB_elements_GSestBCClatcnt and should be ignored for analysis
| Correlation | 1 |
|---|
formulaB_elements_GSmagmom
Numeric
| Distinct count | 8 |
|---|---|
| Unique (%) | 1.1% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 0.050732 |
|---|---|
| Minimum | 0 |
| Maximum | 2.1107 |
| Zeros (%) | 90.0% |
Quantile statistics
| Minimum | 0 |
|---|---|
| 5-th percentile | 0 |
| Q1 | 0 |
| Median | 0 |
| Q3 | 0 |
| 95-th percentile | 0.0022471 |
| Maximum | 2.1107 |
| Range | 2.1107 |
| Interquartile range | 0 |
Descriptive statistics
| Standard deviation | 0.28215 |
|---|---|
| Coef of variation | 5.5615 |
| Kurtosis | 37.478 |
| Mean | 0.050732 |
| MAD | 0.09745 |
| Skewness | 6.0882 |
| Sum | 37.998 |
| Variance | 0.079606 |
| Memory size | 5.9 KiB |
| Value | Count | Frequency (%) | |
| 0.0 | 674 | 90.0% |
|
| 2.25e-05 | 14 | 1.9% |
|
| 0.0022471 | 13 | 1.7% |
|
| 0.5953946999999999 | 12 | 1.6% |
|
| 6.35e-06 | 11 | 1.5% |
|
| 1.5484712 | 9 | 1.2% |
|
| 0.00031 | 8 | 1.1% |
|
| 2.1106627999999996 | 8 | 1.1% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0.0 | 674 | 90.0% |
|
| 6.35e-06 | 11 | 1.5% |
|
| 2.25e-05 | 14 | 1.9% |
|
| 0.00031 | 8 | 1.1% |
|
| 0.0022471 | 13 | 1.7% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 0.00031 | 8 | 1.1% |
|
| 0.0022471 | 13 | 1.7% |
|
| 0.5953946999999999 | 12 | 1.6% |
|
| 1.5484712 | 9 | 1.2% |
|
| 2.1106627999999996 | 8 | 1.1% |
|
formulaB_elements_GSvolume_pa
Highly correlated
This variable is highly correlated with formulaB_elements_GSestFCClatcnt and should be ignored for analysis
| Correlation | 0.9581 |
|---|
formulaB_elements_HHIp
Numeric
| Distinct count | 34 |
|---|---|
| Unique (%) | 4.5% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 4178.5 |
|---|---|
| Minimum | 0 |
| Maximum | 9800 |
| Zeros (%) | 12.4% |
Quantile statistics
| Minimum | 0 |
|---|---|
| 5-th percentile | 0 |
| Q1 | 1600 |
| Median | 3300 |
| Q3 | 6000 |
| 95-th percentile | 9500 |
| Maximum | 9800 |
| Range | 9800 |
| Interquartile range | 4400 |
Descriptive statistics
| Standard deviation | 3175.7 |
|---|---|
| Coef of variation | 0.76001 |
| Kurtosis | -0.96107 |
| Mean | 4178.5 |
| MAD | 2701.5 |
| Skewness | 0.50688 |
| Sum | 3129700 |
| Variance | 10085000 |
| Memory size | 5.9 KiB |
| Value | Count | Frequency (%) | |
| 9500 | 120 | 16.0% |
|
| 0 | 93 | 12.4% |
|
| 5500 | 53 | 7.1% |
|
| 3300 | 49 | 6.5% |
|
| 1600 | 40 | 5.3% |
|
| 5300 | 36 | 4.8% |
|
| 3200 | 32 | 4.3% |
|
| 1100 | 29 | 3.9% |
|
| 6000 | 23 | 3.1% |
|
| 2900 | 23 | 3.1% |
|
| Other values (24) | 251 | 33.5% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0 | 93 | 12.4% |
|
| 500 | 3 | 0.4% |
|
| 700 | 8 | 1.1% |
|
| 1000 | 12 | 1.6% |
|
| 1100 | 29 | 3.9% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 7900 | 5 | 0.7% |
|
| 8000 | 7 | 0.9% |
|
| 8500 | 13 | 1.7% |
|
| 9500 | 120 | 16.0% |
|
| 9800 | 12 | 1.6% |
|
formulaB_elements_HHIr
Numeric
| Distinct count | 34 |
|---|---|
| Unique (%) | 4.5% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 3133.4 |
|---|---|
| Minimum | 0 |
| Maximum | 9100 |
| Zeros (%) | 12.4% |
Quantile statistics
| Minimum | 0 |
|---|---|
| 5-th percentile | 0 |
| Q1 | 1500 |
| Median | 3100 |
| Q3 | 4300 |
| 95-th percentile | 8000 |
| Maximum | 9100 |
| Range | 9100 |
| Interquartile range | 2800 |
Descriptive statistics
| Standard deviation | 2389.2 |
|---|---|
| Coef of variation | 0.76249 |
| Kurtosis | 0.045095 |
| Mean | 3133.4 |
| MAD | 1842.1 |
| Skewness | 0.79989 |
| Sum | 2346900 |
| Variance | 5708100 |
| Memory size | 5.9 KiB |
| Value | Count | Frequency (%) | |
| 3100 | 129 | 17.2% |
|
| 0 | 93 | 12.4% |
|
| 6000 | 48 | 6.4% |
|
| 1900 | 35 | 4.7% |
|
| 1500 | 34 | 4.5% |
|
| 1000 | 33 | 4.4% |
|
| 1600 | 27 | 3.6% |
|
| 500 | 25 | 3.3% |
|
| 8000 | 25 | 3.3% |
|
| 2600 | 24 | 3.2% |
|
| Other values (24) | 276 | 36.8% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0 | 93 | 12.4% |
|
| 500 | 25 | 3.3% |
|
| 1000 | 33 | 4.4% |
|
| 1300 | 7 | 0.9% |
|
| 1400 | 22 | 2.9% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 6900 | 8 | 1.1% |
|
| 7200 | 8 | 1.1% |
|
| 8000 | 25 | 3.3% |
|
| 8800 | 13 | 1.7% |
|
| 9100 | 21 | 2.8% |
|
formulaB_elements_HeatCapacityMass
Numeric
| Distinct count | 73 |
|---|---|
| Unique (%) | 9.7% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 0.43961 |
|---|---|
| Minimum | 0 |
| Maximum | 5.13 |
| Zeros (%) | 5.2% |
Quantile statistics
| Minimum | 0 |
|---|---|
| 5-th percentile | 0 |
| Q1 | 0.16 |
| Median | 0.243 |
| Q3 | 0.449 |
| 95-th percentile | 1.03 |
| Maximum | 5.13 |
| Range | 5.13 |
| Interquartile range | 0.289 |
Descriptive statistics
| Standard deviation | 0.66689 |
|---|---|
| Coef of variation | 1.517 |
| Kurtosis | 27.633 |
| Mean | 0.43961 |
| MAD | 0.33368 |
| Skewness | 4.9026 |
| Sum | 329.27 |
| Variance | 0.44474 |
| Memory size | 5.9 KiB |
| Value | Count | Frequency (%) | |
| 0.0 | 39 | 5.2% |
|
| 0.182 | 23 | 3.1% |
|
| 0.14 | 21 | 2.8% |
|
| 0.235 | 19 | 2.5% |
|
| 0.449 | 17 | 2.3% |
|
| 0.242 | 16 | 2.1% |
|
| 0.524 | 14 | 1.9% |
|
| 0.248 | 14 | 1.9% |
|
| 0.23800000000000002 | 14 | 1.9% |
|
| 0.13 | 14 | 1.9% |
|
| Other values (63) | 558 | 74.5% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0.0 | 39 | 5.2% |
|
| 0.11599999999999999 | 8 | 1.1% |
|
| 0.11800000000000001 | 4 | 0.5% |
|
| 0.12 | 5 | 0.7% |
|
| 0.122 | 12 | 1.6% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 1.03 | 11 | 1.5% |
|
| 1.228 | 9 | 1.2% |
|
| 1.825 | 7 | 0.9% |
|
| 3.582 | 11 | 1.5% |
|
| 5.13 | 7 | 0.9% |
|
formulaB_elements_HeatCapacityMolar
Numeric
| Distinct count | 67 |
|---|---|
| Unique (%) | 8.9% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 24.986 |
|---|---|
| Minimum | 0 |
| Maximum | 75.69 |
| Zeros (%) | 5.2% |
Quantile statistics
| Minimum | 0 |
|---|---|
| 5-th percentile | 0 |
| Q1 | 24.27 |
| Median | 25.363 |
| Q3 | 26.99 |
| 95-th percentile | 32.21 |
| Maximum | 75.69 |
| Range | 75.69 |
| Interquartile range | 2.72 |
Descriptive statistics
| Standard deviation | 9.1118 |
|---|---|
| Coef of variation | 0.36468 |
| Kurtosis | 12.062 |
| Mean | 24.986 |
| MAD | 4.2494 |
| Skewness | 1.2259 |
| Sum | 18714 |
| Variance | 83.024 |
| Memory size | 5.9 KiB |
| Value | Count | Frequency (%) | |
| 20.785999999999998 | 51 | 6.8% |
|
| 0.0 | 39 | 5.2% |
|
| 25.36 | 24 | 3.2% |
|
| 25.52 | 23 | 3.1% |
|
| 24.06 | 21 | 2.8% |
|
| 26.74 | 17 | 2.3% |
|
| 32.21 | 16 | 2.1% |
|
| 24.7 | 14 | 1.9% |
|
| 25.06 | 14 | 1.9% |
|
| 25.1 | 14 | 1.9% |
|
| Other values (57) | 516 | 68.9% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0.0 | 39 | 5.2% |
|
| 8.517000000000001 | 3 | 0.4% |
|
| 11.087 | 8 | 1.1% |
|
| 16.442999999999998 | 7 | 0.9% |
|
| 19.99 | 12 | 1.6% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 31.06 | 7 | 0.9% |
|
| 32.21 | 16 | 2.1% |
|
| 37.03 | 5 | 0.7% |
|
| 54.43 | 11 | 1.5% |
|
| 75.69 | 8 | 1.1% |
|
formulaB_elements_HeatFusion
Numeric
| Distinct count | 79 |
|---|---|
| Unique (%) | 10.5% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 15.344 |
|---|---|
| Minimum | 0 |
| Maximum | 117.4 |
| Zeros (%) | 1.9% |
Quantile statistics
| Minimum | 0 |
|---|---|
| 5-th percentile | 1.164 |
| Q1 | 6.2 |
| Median | 11.106 |
| Q3 | 19.9 |
| 95-th percentile | 46.568 |
| Maximum | 117.4 |
| Range | 117.4 |
| Interquartile range | 13.7 |
Descriptive statistics
| Standard deviation | 14.698 |
|---|---|
| Coef of variation | 0.95793 |
| Kurtosis | 8.923 |
| Mean | 15.344 |
| MAD | 10.736 |
| Skewness | 2.2234 |
| Sum | 11492 |
| Variance | 216.03 |
| Memory size | 5.9 KiB |
| Value | Count | Frequency (%) | |
| 21.0 | 21 | 2.8% |
|
| 33.29 | 17 | 2.3% |
|
| 2.09 | 16 | 2.1% |
|
| 1.1640000000000001 | 14 | 1.9% |
|
| 11.3 | 14 | 1.9% |
|
| 0.0 | 14 | 1.9% |
|
| 38.59 | 14 | 1.9% |
|
| 14.15 | 14 | 1.9% |
|
| 6.69 | 13 | 1.7% |
|
| 18.65 | 13 | 1.7% |
|
| Other values (69) | 599 | 80.0% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0.0 | 14 | 1.9% |
|
| 0.16399999999999998 | 11 | 1.5% |
|
| 1.1640000000000001 | 14 | 1.9% |
|
| 1.18 | 10 | 1.3% |
|
| 1.7209999999999999 | 8 | 1.1% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 50.2 | 8 | 1.1% |
|
| 50.21 | 12 | 1.6% |
|
| 52.31 | 10 | 1.3% |
|
| 57.85 | 5 | 0.7% |
|
| 117.4 | 3 | 0.4% |
|
formulaB_elements_ICSDVolume
Highly correlated
This variable is highly correlated with formulaB_elements_GSestFCClatcnt and should be ignored for analysis
| Correlation | 0.92375 |
|---|
formulaB_elements_IsAlkali
Boolean
| Distinct count | 2 |
|---|---|
| Unique (%) | 0.3% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Mean | 0.12417 |
|---|
| 0 |
656
|
|---|---|
| 1 |
|
| Value | Count | Frequency (%) | |
| 0 | 656 | 87.6% |
|
| 1 | 93 | 12.4% |
|
formulaB_elements_IsDBlock
Boolean
| Distinct count | 2 |
|---|---|
| Unique (%) | 0.3% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Mean | 0.39119 |
|---|
| 0 |
456
|
|---|---|
| 1 |
293
|
| Value | Count | Frequency (%) | |
| 0 | 456 | 60.9% |
|
| 1 | 293 | 39.1% |
|
formulaB_elements_IsFBlock
Boolean
| Distinct count | 2 |
|---|---|
| Unique (%) | 0.3% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Mean | 0.20294 |
|---|
| 0 |
597
|
|---|---|
| 1 |
152
|
| Value | Count | Frequency (%) | |
| 0 | 597 | 79.7% |
|
| 1 | 152 | 20.3% |
|
formulaB_elements_IsMetal
Boolean
| Distinct count | 2 |
|---|---|
| Unique (%) | 0.3% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Mean | 0.81308 |
|---|
| 1 |
609
|
|---|---|
| 0 |
140
|
| Value | Count | Frequency (%) | |
| 1 | 609 | 81.3% |
|
| 0 | 140 | 18.7% |
|
formulaB_elements_IsMetalloid
Boolean
| Distinct count | 2 |
|---|---|
| Unique (%) | 0.3% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Mean | 0.068091 |
|---|
| 0 |
698
|
|---|---|
| 1 |
|
| Value | Count | Frequency (%) | |
| 0 | 698 | 93.2% |
|
| 1 | 51 | 6.8% |
|
formulaB_elements_IsNonmetal
Boolean
| Distinct count | 2 |
|---|---|
| Unique (%) | 0.3% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Mean | 0.14019 |
|---|
| 0 |
644
|
|---|---|
| 1 |
|
| Value | Count | Frequency (%) | |
| 0 | 644 | 86.0% |
|
| 1 | 105 | 14.0% |
|
formulaB_elements_MeltingT
Numeric
| Distinct count | 82 |
|---|---|
| Unique (%) | 10.9% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 1343.8 |
|---|---|
| Minimum | 0 |
| Maximum | 3823 |
| Zeros (%) | 0.9% |
Quantile statistics
| Minimum | 0 |
|---|---|
| 5-th percentile | 115.79 |
| Q1 | 505.08 |
| Median | 1234.9 |
| Q3 | 1845 |
| 95-th percentile | 3290 |
| Maximum | 3823 |
| Range | 3823 |
| Interquartile range | 1339.9 |
Descriptive statistics
| Standard deviation | 907.07 |
|---|---|
| Coef of variation | 0.67499 |
| Kurtosis | -0.23011 |
| Mean | 1343.8 |
| MAD | 740.77 |
| Skewness | 0.5869 |
| Sum | 1006500 |
| Variance | 822770 |
| Memory size | 5.9 KiB |
| Value | Count | Frequency (%) | |
| 2430.0 | 17 | 2.3% |
|
| 301.59 | 16 | 2.1% |
|
| 1941.0 | 14 | 1.9% |
|
| 1234.93 | 14 | 1.9% |
|
| 115.79 | 14 | 1.9% |
|
| 2607.0 | 14 | 1.9% |
|
| 1357.77 | 13 | 1.7% |
|
| 923.0 | 13 | 1.7% |
|
| 2750.0 | 13 | 1.7% |
|
| 494.0 | 13 | 1.7% |
|
| Other values (72) | 608 | 81.2% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0.0 | 7 | 0.9% |
|
| 24.56 | 11 | 1.5% |
|
| 83.8 | 10 | 1.3% |
|
| 115.79 | 14 | 1.9% |
|
| 161.3 | 9 | 1.2% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 3290.0 | 12 | 1.6% |
|
| 3306.0 | 5 | 0.7% |
|
| 3459.0 | 12 | 1.6% |
|
| 3695.0 | 10 | 1.3% |
|
| 3823.0 | 3 | 0.4% |
|
formulaB_elements_MendeleevNumber
Highly correlated
This variable is highly correlated with formulaB_elements_Column and should be ignored for analysis
| Correlation | 0.93005 |
|---|
formulaB_elements_MiracleRadius
Numeric
| Distinct count | 52 |
|---|---|
| Unique (%) | 6.9% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 139.39 |
|---|---|
| Minimum | 0 |
| Maximum | 264 |
| Zeros (%) | 10.0% |
Quantile statistics
| Minimum | 0 |
|---|---|
| 5-th percentile | 0 |
| Q1 | 126 |
| Median | 143 |
| Q3 | 175 |
| 95-th percentile | 212 |
| Maximum | 264 |
| Range | 264 |
| Interquartile range | 49 |
Descriptive statistics
| Standard deviation | 56.567 |
|---|---|
| Coef of variation | 0.40581 |
| Kurtosis | 1.5802 |
| Mean | 139.39 |
| MAD | 38.05 |
| Skewness | -1.0415 |
| Sum | 104404 |
| Variance | 3199.8 |
| Memory size | 5.9 KiB |
| Value | Count | Frequency (%) | |
| 0 | 75 | 10.0% |
|
| 175 | 44 | 5.9% |
|
| 134 | 33 | 4.4% |
|
| 155 | 29 | 3.9% |
|
| 126 | 25 | 3.3% |
|
| 136 | 23 | 3.1% |
|
| 162 | 23 | 3.1% |
|
| 158 | 20 | 2.7% |
|
| 152 | 20 | 2.7% |
|
| 142 | 19 | 2.5% |
|
| Other values (42) | 438 | 58.5% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0 | 75 | 10.0% |
|
| 77 | 3 | 0.4% |
|
| 88 | 8 | 1.1% |
|
| 102 | 11 | 1.5% |
|
| 103 | 8 | 1.1% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 212 | 7 | 0.9% |
|
| 223 | 6 | 0.8% |
|
| 230 | 8 | 1.1% |
|
| 244 | 7 | 0.9% |
|
| 264 | 16 | 2.1% |
|
formulaB_elements_NUnfilled
Numeric
| Distinct count | 17 |
|---|---|
| Unique (%) | 2.3% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 4.4352 |
|---|---|
| Minimum | 0 |
| Maximum | 22 |
| Zeros (%) | 17.6% |
Quantile statistics
| Minimum | 0 |
|---|---|
| 5-th percentile | 0 |
| Q1 | 1 |
| Median | 4 |
| Q3 | 7 |
| 95-th percentile | 11 |
| Maximum | 22 |
| Range | 22 |
| Interquartile range | 6 |
Descriptive statistics
| Standard deviation | 4.558 |
|---|---|
| Coef of variation | 1.0277 |
| Kurtosis | 4.2426 |
| Mean | 4.4352 |
| MAD | 3.2809 |
| Skewness | 1.8567 |
| Sum | 3322 |
| Variance | 20.776 |
| Memory size | 5.9 KiB |
| Value | Count | Frequency (%) | |
| 0 | 132 | 17.6% |
|
| 1 | 112 | 15.0% |
|
| 5 | 89 | 11.9% |
|
| 4 | 83 | 11.1% |
|
| 3 | 65 | 8.7% |
|
| 9 | 56 | 7.5% |
|
| 2 | 53 | 7.1% |
|
| 7 | 45 | 6.0% |
|
| 8 | 41 | 5.5% |
|
| 6 | 26 | 3.5% |
|
| Other values (7) | 47 | 6.3% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0 | 132 | 17.6% |
|
| 1 | 112 | 15.0% |
|
| 2 | 53 | 7.1% |
|
| 3 | 65 | 8.7% |
|
| 4 | 83 | 11.1% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 16 | 5 | 0.7% |
|
| 19 | 8 | 1.1% |
|
| 20 | 8 | 1.1% |
|
| 21 | 7 | 0.9% |
|
| 22 | 8 | 1.1% |
|
formulaB_elements_NValance
Numeric
| Distinct count | 29 |
|---|---|
| Unique (%) | 3.9% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 10.171 |
|---|---|
| Minimum | 1 |
| Maximum | 29 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 1 |
|---|---|
| 5-th percentile | 1 |
| Q1 | 4 |
| Median | 8 |
| Q3 | 15 |
| 95-th percentile | 25 |
| Maximum | 29 |
| Range | 28 |
| Interquartile range | 11 |
Descriptive statistics
| Standard deviation | 7.1721 |
|---|---|
| Coef of variation | 0.70516 |
| Kurtosis | -0.16288 |
| Mean | 10.171 |
| MAD | 5.9314 |
| Skewness | 0.75974 |
| Sum | 7618 |
| Variance | 51.439 |
| Memory size | 5.9 KiB |
| Value | Count | Frequency (%) | |
| 8 | 54 | 7.2% |
|
| 4 | 53 | 7.1% |
|
| 1 | 51 | 6.8% |
|
| 3 | 51 | 6.8% |
|
| 2 | 49 | 6.5% |
|
| 5 | 42 | 5.6% |
|
| 7 | 40 | 5.3% |
|
| 6 | 39 | 5.2% |
|
| 11 | 37 | 4.9% |
|
| 17 | 32 | 4.3% |
|
| Other values (19) | 301 | 40.2% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 1 | 51 | 6.8% |
|
| 2 | 49 | 6.5% |
|
| 3 | 51 | 6.8% |
|
| 4 | 53 | 7.1% |
|
| 5 | 42 | 5.6% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 25 | 6 | 0.8% |
|
| 26 | 9 | 1.2% |
|
| 27 | 4 | 0.5% |
|
| 28 | 9 | 1.2% |
|
| 29 | 12 | 1.6% |
|
formulaB_elements_NdUnfilled
Numeric
| Distinct count | 10 |
|---|---|
| Unique (%) | 1.3% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 2.3044 |
|---|---|
| Minimum | 0 |
| Maximum | 9 |
| Zeros (%) | 62.6% |
Quantile statistics
| Minimum | 0 |
|---|---|
| 5-th percentile | 0 |
| Q1 | 0 |
| Median | 0 |
| Q3 | 5 |
| 95-th percentile | 9 |
| Maximum | 9 |
| Range | 9 |
| Interquartile range | 5 |
Descriptive statistics
| Standard deviation | 3.3636 |
|---|---|
| Coef of variation | 1.4596 |
| Kurtosis | -0.56836 |
| Mean | 2.3044 |
| MAD | 2.9354 |
| Skewness | 1.0404 |
| Sum | 1726 |
| Variance | 11.314 |
| Memory size | 5.9 KiB |
| Value | Count | Frequency (%) | |
| 0 | 469 | 62.6% |
|
| 9 | 85 | 11.3% |
|
| 5 | 53 | 7.1% |
|
| 8 | 30 | 4.0% |
|
| 3 | 29 | 3.9% |
|
| 6 | 23 | 3.1% |
|
| 7 | 19 | 2.5% |
|
| 2 | 18 | 2.4% |
|
| 4 | 13 | 1.7% |
|
| 1 | 10 | 1.3% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0 | 469 | 62.6% |
|
| 1 | 10 | 1.3% |
|
| 2 | 18 | 2.4% |
|
| 3 | 29 | 3.9% |
|
| 4 | 13 | 1.7% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 5 | 53 | 7.1% |
|
| 6 | 23 | 3.1% |
|
| 7 | 19 | 2.5% |
|
| 8 | 30 | 4.0% |
|
| 9 | 85 | 11.3% |
|
formulaB_elements_NdValence
Numeric
| Distinct count | 11 |
|---|---|
| Unique (%) | 1.5% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 4.2777 |
|---|---|
| Minimum | 0 |
| Maximum | 10 |
| Zeros (%) | 34.2% |
Quantile statistics
| Minimum | 0 |
|---|---|
| 5-th percentile | 0 |
| Q1 | 0 |
| Median | 3 |
| Q3 | 10 |
| 95-th percentile | 10 |
| Maximum | 10 |
| Range | 10 |
| Interquartile range | 10 |
Descriptive statistics
| Standard deviation | 4.2505 |
|---|---|
| Coef of variation | 0.99365 |
| Kurtosis | -1.6293 |
| Mean | 4.2777 |
| MAD | 3.9324 |
| Skewness | 0.33404 |
| Sum | 3204 |
| Variance | 18.067 |
| Memory size | 5.9 KiB |
| Value | Count | Frequency (%) | |
| 0 | 256 | 34.2% |
|
| 10 | 213 | 28.4% |
|
| 1 | 85 | 11.3% |
|
| 5 | 53 | 7.1% |
|
| 2 | 30 | 4.0% |
|
| 7 | 29 | 3.9% |
|
| 4 | 23 | 3.1% |
|
| 3 | 19 | 2.5% |
|
| 8 | 18 | 2.4% |
|
| 6 | 13 | 1.7% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0 | 256 | 34.2% |
|
| 1 | 85 | 11.3% |
|
| 2 | 30 | 4.0% |
|
| 3 | 19 | 2.5% |
|
| 4 | 23 | 3.1% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 6 | 13 | 1.7% |
|
| 7 | 29 | 3.9% |
|
| 8 | 18 | 2.4% |
|
| 9 | 10 | 1.3% |
|
| 10 | 213 | 28.4% |
|
formulaB_elements_NfUnfilled
Numeric
| Distinct count | 13 |
|---|---|
| Unique (%) | 1.7% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 1.1549 |
|---|---|
| Minimum | 0 |
| Maximum | 13 |
| Zeros (%) | 84.5% |
Quantile statistics
| Minimum | 0 |
|---|---|
| 5-th percentile | 0 |
| Q1 | 0 |
| Median | 0 |
| Q3 | 0 |
| 95-th percentile | 10 |
| Maximum | 13 |
| Range | 13 |
| Interquartile range | 0 |
Descriptive statistics
| Standard deviation | 3.042 |
|---|---|
| Coef of variation | 2.6341 |
| Kurtosis | 5.3985 |
| Mean | 1.1549 |
| MAD | 1.9558 |
| Skewness | 2.589 |
| Sum | 865 |
| Variance | 9.2541 |
| Memory size | 5.9 KiB |
| Value | Count | Frequency (%) | |
| 0 | 633 | 84.5% |
|
| 7 | 18 | 2.4% |
|
| 10 | 15 | 2.0% |
|
| 11 | 12 | 1.6% |
|
| 8 | 11 | 1.5% |
|
| 5 | 10 | 1.3% |
|
| 1 | 9 | 1.2% |
|
| 13 | 8 | 1.1% |
|
| 4 | 8 | 1.1% |
|
| 12 | 7 | 0.9% |
|
| Other values (3) | 18 | 2.4% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0 | 633 | 84.5% |
|
| 1 | 9 | 1.2% |
|
| 2 | 6 | 0.8% |
|
| 3 | 5 | 0.7% |
|
| 4 | 8 | 1.1% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 9 | 7 | 0.9% |
|
| 10 | 15 | 2.0% |
|
| 11 | 12 | 1.6% |
|
| 12 | 7 | 0.9% |
|
| 13 | 8 | 1.1% |
|
formulaB_elements_NfValence
Numeric
| Distinct count | 14 |
|---|---|
| Unique (%) | 1.9% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 3.1442 |
|---|---|
| Minimum | 0 |
| Maximum | 14 |
| Zeros (%) | 69.3% |
Quantile statistics
| Minimum | 0 |
|---|---|
| 5-th percentile | 0 |
| Q1 | 0 |
| Median | 0 |
| Q3 | 5 |
| 95-th percentile | 14 |
| Maximum | 14 |
| Range | 14 |
| Interquartile range | 5 |
Descriptive statistics
| Standard deviation | 5.3458 |
|---|---|
| Coef of variation | 1.7002 |
| Kurtosis | -0.026564 |
| Mean | 3.1442 |
| MAD | 4.4292 |
| Skewness | 1.3228 |
| Sum | 2355 |
| Variance | 28.578 |
| Memory size | 5.9 KiB |
| Value | Count | Frequency (%) | |
| 0 | 519 | 69.3% |
|
| 14 | 114 | 15.2% |
|
| 7 | 18 | 2.4% |
|
| 4 | 15 | 2.0% |
|
| 3 | 12 | 1.6% |
|
| 6 | 11 | 1.5% |
|
| 9 | 10 | 1.3% |
|
| 13 | 9 | 1.2% |
|
| 10 | 8 | 1.1% |
|
| 1 | 8 | 1.1% |
|
| Other values (4) | 25 | 3.3% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0 | 519 | 69.3% |
|
| 1 | 8 | 1.1% |
|
| 2 | 7 | 0.9% |
|
| 3 | 12 | 1.6% |
|
| 4 | 15 | 2.0% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 10 | 8 | 1.1% |
|
| 11 | 5 | 0.7% |
|
| 12 | 6 | 0.8% |
|
| 13 | 9 | 1.2% |
|
| 14 | 114 | 15.2% |
|
formulaB_elements_NpUnfilled
Numeric
| Distinct count | 6 |
|---|---|
| Unique (%) | 0.8% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 0.78505 |
|---|---|
| Minimum | 0 |
| Maximum | 5 |
| Zeros (%) | 76.9% |
Quantile statistics
| Minimum | 0 |
|---|---|
| 5-th percentile | 0 |
| Q1 | 0 |
| Median | 0 |
| Q3 | 0 |
| 95-th percentile | 5 |
| Maximum | 5 |
| Range | 5 |
| Interquartile range | 0 |
Descriptive statistics
| Standard deviation | 1.5626 |
|---|---|
| Coef of variation | 1.9904 |
| Kurtosis | 1.4176 |
| Mean | 0.78505 |
| MAD | 1.2074 |
| Skewness | 1.7322 |
| Sum | 588 |
| Variance | 2.4417 |
| Memory size | 5.9 KiB |
| Value | Count | Frequency (%) | |
| 0 | 576 | 76.9% |
|
| 4 | 48 | 6.4% |
|
| 5 | 42 | 5.6% |
|
| 3 | 39 | 5.2% |
|
| 2 | 25 | 3.3% |
|
| 1 | 19 | 2.5% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0 | 576 | 76.9% |
|
| 1 | 19 | 2.5% |
|
| 2 | 25 | 3.3% |
|
| 3 | 39 | 5.2% |
|
| 4 | 48 | 6.4% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 1 | 19 | 2.5% |
|
| 2 | 25 | 3.3% |
|
| 3 | 39 | 5.2% |
|
| 4 | 48 | 6.4% |
|
| 5 | 42 | 5.6% |
|
formulaB_elements_NpValence
Numeric
| Distinct count | 7 |
|---|---|
| Unique (%) | 0.9% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 0.95327 |
|---|---|
| Minimum | 0 |
| Maximum | 6 |
| Zeros (%) | 71.0% |
Quantile statistics
| Minimum | 0 |
|---|---|
| 5-th percentile | 0 |
| Q1 | 0 |
| Median | 0 |
| Q3 | 1 |
| 95-th percentile | 6 |
| Maximum | 6 |
| Range | 6 |
| Interquartile range | 1 |
Descriptive statistics
| Standard deviation | 1.7775 |
|---|---|
| Coef of variation | 1.8647 |
| Kurtosis | 1.9792 |
| Mean | 0.95327 |
| MAD | 1.3542 |
| Skewness | 1.8021 |
| Sum | 714 |
| Variance | 3.1596 |
| Memory size | 5.9 KiB |
| Value | Count | Frequency (%) | |
| 0 | 532 | 71.0% |
|
| 2 | 48 | 6.4% |
|
| 6 | 44 | 5.9% |
|
| 1 | 42 | 5.6% |
|
| 3 | 39 | 5.2% |
|
| 4 | 25 | 3.3% |
|
| 5 | 19 | 2.5% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0 | 532 | 71.0% |
|
| 1 | 42 | 5.6% |
|
| 2 | 48 | 6.4% |
|
| 3 | 39 | 5.2% |
|
| 4 | 25 | 3.3% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 2 | 48 | 6.4% |
|
| 3 | 39 | 5.2% |
|
| 4 | 25 | 3.3% |
|
| 5 | 19 | 2.5% |
|
| 6 | 44 | 5.9% |
|
formulaB_elements_NsUnfilled
Boolean
| Distinct count | 2 |
|---|---|
| Unique (%) | 0.3% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Mean | 0.19092 |
|---|
| 0 |
606
|
|---|---|
| 1 |
143
|
| Value | Count | Frequency (%) | |
| 0 | 606 | 80.9% |
|
| 1 | 143 | 19.1% |
|
formulaB_elements_NsValence
Numeric
| Distinct count | 3 |
|---|---|
| Unique (%) | 0.4% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 1.7957 |
|---|---|
| Minimum | 0 |
| Maximum | 2 |
| Zeros (%) | 0.7% |
Quantile statistics
| Minimum | 0 |
|---|---|
| 5-th percentile | 1 |
| Q1 | 2 |
| Median | 2 |
| Q3 | 2 |
| 95-th percentile | 2 |
| Maximum | 2 |
| Range | 2 |
| Interquartile range | 0 |
Descriptive statistics
| Standard deviation | 0.41968 |
|---|---|
| Coef of variation | 0.23371 |
| Kurtosis | 1.7826 |
| Mean | 1.7957 |
| MAD | 0.32782 |
| Skewness | -1.7387 |
| Sum | 1345 |
| Variance | 0.17613 |
| Memory size | 5.9 KiB |
| Value | Count | Frequency (%) | |
| 2 | 601 | 80.2% |
|
| 1 | 143 | 19.1% |
|
| 0 | 5 | 0.7% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0 | 5 | 0.7% |
|
| 1 | 143 | 19.1% |
|
| 2 | 601 | 80.2% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 0 | 5 | 0.7% |
|
| 1 | 143 | 19.1% |
|
| 2 | 601 | 80.2% |
|
formulaB_elements_Number
Highly correlated
This variable is highly correlated with formulaB_elements_AtomicWeight and should be ignored for analysis
| Correlation | 0.99874 |
|---|
formulaB_elements_Polarizability
Numeric
| Distinct count | 79 |
|---|---|
| Unique (%) | 10.5% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 14.386 |
|---|---|
| Minimum | -5.8167 |
| Maximum | 59.42 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | -5.8167 |
|---|---|
| 5-th percentile | 0.4 |
| Q1 | 5.8 |
| Median | 9.65 |
| Q3 | 23.5 |
| 95-th percentile | 32.1 |
| Maximum | 59.42 |
| Range | 65.237 |
| Interquartile range | 17.7 |
Descriptive statistics
| Standard deviation | 12.447 |
|---|---|
| Coef of variation | 0.86521 |
| Kurtosis | 2.1957 |
| Mean | 14.386 |
| MAD | 9.9403 |
| Skewness | 1.3544 |
| Sum | 10775 |
| Variance | 154.92 |
| Memory size | 5.9 KiB |
| Value | Count | Frequency (%) | |
| 6.8 | 19 | 2.5% |
|
| 22.7 | 18 | 2.4% |
|
| 11.4 | 17 | 2.3% |
|
| 59.42 | 16 | 2.1% |
|
| 9.6 | 14 | 1.9% |
|
| 6.99 | 14 | 1.9% |
|
| 2.4844 | 14 | 1.9% |
|
| 14.6 | 14 | 1.9% |
|
| 27.7 | 13 | 1.7% |
|
| 10.76666667 | 13 | 1.7% |
|
| Other values (69) | 597 | 79.7% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| -5.816666667000001 | 12 | 1.6% |
|
| 0.2050522 | 7 | 0.9% |
|
| 0.39432 | 11 | 1.5% |
|
| 0.4 | 12 | 1.6% |
|
| 1.6411 | 10 | 1.3% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 32.1 | 9 | 1.2% |
|
| 39.7 | 6 | 0.8% |
|
| 43.23 | 8 | 1.1% |
|
| 47.27 | 7 | 0.9% |
|
| 59.42 | 16 | 2.1% |
|
formulaB_elements_Row
Highly correlated
This variable is highly correlated with formulaB_elements_Number and should be ignored for analysis
| Correlation | 0.94875 |
|---|
formulaB_elements_ShearModulus
Numeric
| Distinct count | 49 |
|---|---|
| Unique (%) | 6.5% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 33.427 |
|---|---|
| Minimum | 0 |
| Maximum | 222 |
| Zeros (%) | 30.2% |
Quantile statistics
| Minimum | 0 |
|---|---|
| 5-th percentile | 0 |
| Q1 | 0 |
| Median | 17 |
| Q3 | 38 |
| 95-th percentile | 161 |
| Maximum | 222 |
| Range | 222 |
| Interquartile range | 38 |
Descriptive statistics
| Standard deviation | 48.715 |
|---|---|
| Coef of variation | 1.4573 |
| Kurtosis | 3.7695 |
| Mean | 33.427 |
| MAD | 33.922 |
| Skewness | 2.0794 |
| Sum | 25037 |
| Variance | 2373.1 |
| Memory size | 5.9 KiB |
| Value | Count | Frequency (%) | |
| 0.0 | 226 | 30.2% |
|
| 26.0 | 24 | 3.2% |
|
| 20.0 | 23 | 3.1% |
|
| 18.0 | 20 | 2.7% |
|
| 27.0 | 19 | 2.5% |
|
| 44.0 | 19 | 2.5% |
|
| 14.0 | 16 | 2.1% |
|
| 22.0 | 15 | 2.0% |
|
| 173.0 | 14 | 1.9% |
|
| 30.0 | 14 | 1.9% |
|
| Other values (39) | 359 | 47.9% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0.0 | 226 | 30.2% |
|
| 1.3 | 8 | 1.1% |
|
| 2.8 | 4 | 0.5% |
|
| 3.3 | 9 | 1.2% |
|
| 3.7 | 13 | 1.7% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 161.0 | 10 | 1.3% |
|
| 173.0 | 14 | 1.9% |
|
| 178.0 | 12 | 1.6% |
|
| 210.0 | 6 | 0.8% |
|
| 222.0 | 5 | 0.7% |
|
formulaB_elements_SpaceGroupNumber
Numeric
| Distinct count | 16 |
|---|---|
| Unique (%) | 2.1% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 186.14 |
|---|---|
| Minimum | 2 |
| Maximum | 229 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 2 |
|---|---|
| 5-th percentile | 62 |
| Q1 | 194 |
| Median | 194 |
| Q3 | 225 |
| 95-th percentile | 229 |
| Maximum | 229 |
| Range | 227 |
| Interquartile range | 31 |
Descriptive statistics
| Standard deviation | 58.864 |
|---|---|
| Coef of variation | 0.31624 |
| Kurtosis | 2.4045 |
| Mean | 186.14 |
| MAD | 41.4 |
| Skewness | -1.8316 |
| Sum | 139416 |
| Variance | 3465 |
| Memory size | 5.9 KiB |
| Value | Count | Frequency (%) | |
| 194 | 242 | 32.3% |
|
| 225 | 181 | 24.2% |
|
| 229 | 136 | 18.2% |
|
| 166 | 44 | 5.9% |
|
| 64 | 31 | 4.1% |
|
| 139 | 18 | 2.4% |
|
| 141 | 13 | 1.7% |
|
| 14 | 13 | 1.7% |
|
| 227 | 12 | 1.6% |
|
| 12 | 12 | 1.6% |
|
| Other values (6) | 47 | 6.3% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 2 | 11 | 1.5% |
|
| 12 | 12 | 1.6% |
|
| 14 | 13 | 1.7% |
|
| 62 | 8 | 1.1% |
|
| 63 | 8 | 1.1% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 194 | 242 | 32.3% |
|
| 217 | 8 | 1.1% |
|
| 225 | 181 | 24.2% |
|
| 227 | 12 | 1.6% |
|
| 229 | 136 | 18.2% |
|
| formulaA | formulaB | formulaA_elements_AtomicVolume | formulaB_elements_AtomicVolume | formulaA_elements_AtomicWeight | formulaB_elements_AtomicWeight | formulaA_elements_BoilingT | formulaB_elements_BoilingT | formulaA_elements_BulkModulus | formulaB_elements_BulkModulus | formulaA_elements_Column | formulaB_elements_Column | formulaA_elements_CovalentRadius | formulaB_elements_CovalentRadius | formulaA_elements_Density | formulaB_elements_Density | formulaA_elements_ElectronSurfaceDensityWS | formulaB_elements_ElectronSurfaceDensityWS | formulaA_elements_Electronegativity | formulaB_elements_Electronegativity | formulaA_elements_FirstIonizationEnergy | formulaB_elements_FirstIonizationEnergy | formulaA_elements_GSbandgap | formulaB_elements_GSbandgap | formulaA_elements_GSenergy_pa | formulaB_elements_GSenergy_pa | formulaA_elements_GSestBCClatcnt | formulaB_elements_GSestBCClatcnt | formulaA_elements_GSestFCClatcnt | formulaB_elements_GSestFCClatcnt | formulaA_elements_GSmagmom | formulaB_elements_GSmagmom | formulaA_elements_GSvolume_pa | formulaB_elements_GSvolume_pa | formulaA_elements_HHIp | formulaB_elements_HHIp | formulaA_elements_HHIr | formulaB_elements_HHIr | formulaA_elements_HeatCapacityMass | formulaB_elements_HeatCapacityMass | formulaA_elements_HeatCapacityMolar | formulaB_elements_HeatCapacityMolar | formulaA_elements_HeatFusion | formulaB_elements_HeatFusion | formulaA_elements_ICSDVolume | formulaB_elements_ICSDVolume | formulaA_elements_IsAlkali | formulaB_elements_IsAlkali | formulaA_elements_IsDBlock | formulaB_elements_IsDBlock | formulaA_elements_IsFBlock | formulaB_elements_IsFBlock | formulaA_elements_IsMetal | formulaB_elements_IsMetal | formulaA_elements_IsMetalloid | formulaB_elements_IsMetalloid | formulaA_elements_IsNonmetal | formulaB_elements_IsNonmetal | formulaA_elements_MeltingT | formulaB_elements_MeltingT | formulaA_elements_MendeleevNumber | formulaB_elements_MendeleevNumber | formulaA_elements_MiracleRadius | 
formulaB_elements_MiracleRadius | formulaA_elements_NUnfilled | formulaB_elements_NUnfilled | formulaA_elements_NValance | formulaB_elements_NValance | formulaA_elements_NdUnfilled | formulaB_elements_NdUnfilled | formulaA_elements_NdValence | formulaB_elements_NdValence | formulaA_elements_NfUnfilled | formulaB_elements_NfUnfilled | formulaA_elements_NfValence | formulaB_elements_NfValence | formulaA_elements_NpUnfilled | formulaB_elements_NpUnfilled | formulaA_elements_NpValence | formulaB_elements_NpValence | formulaA_elements_NsUnfilled | formulaB_elements_NsUnfilled | formulaA_elements_NsValence | formulaB_elements_NsValence | formulaA_elements_Number | formulaB_elements_Number | formulaA_elements_Polarizability | formulaB_elements_Polarizability | formulaA_elements_Row | formulaB_elements_Row | formulaA_elements_ShearModulus | formulaB_elements_ShearModulus | formulaA_elements_SpaceGroupNumber | formulaB_elements_SpaceGroupNumber | avg_coordination_A | avg_coordination_B | avg_nearest_neighbor_distance_A | avg_nearest_neighbor_distance_B | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Ac | Be | 37.433086 | 8.098176 | 227.0 | 9.012182 | 3473.0 | 2743.0 | 0.0 | 130.0 | 3 | 2 | 215 | 96 | 10070.0 | 1848.0 | 0.0 | 1.67 | 1.1 | 1.57 | 5.17 | 9.322700 | 0.0 | 0.0 | -4.105002 | -3.755039 | 4.465163 | 2.508239 | 5.625753 | 3.160184 | 0.0 | 0.0 | 44.5125 | 7.890000 | 0 | 8000 | 0 | 4000 | 0.12 | 1.825 | 27.2 | 16.443 | 12.0 | 7.895 | 37.45 | 8.11 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1323.0 | 1560.00 | 14 | 67 | 0 | 112 | 9 | 0 | 3 | 2 | 9 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 2 | 89 | 4 | 32.1 | 5.60 | 7 | 2 | 0.0 | 132.0 | 225 | 194 | 12.0 | 12.0 | 3.99462 | 2.20087 |
| 1 | Ac | Cd | 37.433086 | 21.580025 | 227.0 | 112.411000 | 3473.0 | 1040.0 | 0.0 | 42.0 | 3 | 12 | 215 | 144 | 10070.0 | 8650.0 | 0.0 | 1.24 | 1.1 | 1.69 | 5.17 | 8.993820 | 0.0 | 0.0 | -4.105002 | -0.812174 | 4.465163 | 3.390922 | 5.625753 | 4.272293 | 0.0 | 0.0 | 44.5125 | 19.495000 | 0 | 1700 | 0 | 1300 | 0.12 | 0.231 | 27.2 | 26.020 | 12.0 | 6.210 | 37.45 | 21.59 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1323.0 | 594.22 | 14 | 70 | 0 | 157 | 9 | 0 | 3 | 12 | 9 | 0 | 1 | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 2 | 89 | 48 | 32.1 | 7.32 | 7 | 5 | 0.0 | 19.0 | 225 | 194 | 12.0 | 6.0 | 3.99462 | 3.00785 |
| 2 | Ac | Cs | 37.433086 | 117.456016 | 227.0 | 132.905452 | 3473.0 | 944.0 | 0.0 | 1.6 | 3 | 1 | 215 | 244 | 10070.0 | 1879.0 | 0.0 | 0.55 | 1.1 | 0.79 | 5.17 | 3.893905 | 0.0 | 0.0 | -4.105002 | -0.854626 | 4.465163 | 6.140481 | 5.625753 | 7.736522 | 0.0 | 0.0 | 44.5125 | 115.765000 | 0 | 6000 | 0 | 6000 | 0.12 | 0.242 | 27.2 | 32.210 | 12.0 | 2.090 | 37.45 | 67.55 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1323.0 | 301.59 | 14 | 5 | 0 | 264 | 9 | 1 | 3 | 1 | 9 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | 89 | 55 | 32.1 | 59.42 | 7 | 6 | 0.0 | 0.0 | 225 | 229 | 12.0 | 8.0 | 3.99462 | 5.32395 |
| 3 | Ac | Ho | 37.433086 | 31.140380 | 227.0 | 164.930320 | 3473.0 | 2973.0 | 0.0 | 40.0 | 3 | 3 | 215 | 192 | 10070.0 | 8795.0 | 0.0 | 1.22 | 1.1 | 1.23 | 5.17 | 6.021500 | 0.0 | 0.0 | -4.105002 | -4.577417 | 4.465163 | 3.946510 | 5.625753 | 4.972291 | 0.0 | 0.0 | 44.5125 | 30.733333 | 0 | 9500 | 0 | 3100 | 0.12 | 0.165 | 27.2 | 27.150 | 12.0 | 11.760 | 37.45 | 31.11 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1323.0 | 1747.00 | 14 | 33 | 0 | 177 | 9 | 3 | 3 | 13 | 9 | 0 | 1 | 0 | 0 | 3 | 0 | 11 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 2 | 89 | 67 | 32.1 | 23.60 | 7 | 6 | 0.0 | 26.0 | 225 | 194 | 12.0 | 12.0 | 3.99462 | 3.48112 |
| 4 | Ac | K | 37.433086 | 75.847865 | 227.0 | 39.098300 | 3473.0 | 1032.0 | 0.0 | 3.1 | 3 | 1 | 215 | 203 | 10070.0 | 856.0 | 0.0 | 0.65 | 1.1 | 0.82 | 5.17 | 4.340663 | 0.0 | 0.0 | -4.105002 | -1.097540 | 4.465163 | 5.268201 | 5.625753 | 6.637517 | 0.0 | 0.0 | 44.5125 | 73.106667 | 0 | 1700 | 0 | 7200 | 0.12 | 0.757 | 27.2 | 29.600 | 12.0 | 2.335 | 37.45 | 75.63 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1323.0 | 336.53 | 14 | 3 | 0 | 230 | 9 | 1 | 3 | 1 | 9 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 2 | 1 | 89 | 19 | 32.1 | 43.23 | 7 | 4 | 0.0 | 1.3 | 225 | 229 | 12.0 | 8.0 | 3.99462 | 4.57083 |
# Prepare the test data in binary format (same dimensions as the training data) for prediction
def test_to_binary(test):
df_new = pd.DataFrame()
for i in range(11):
df = test.copy()
df['B ratio'] = i * 10
df_new = pd.concat([df, df_new])
# drop the highly correlated features
df_new = df_new.drop(['formulaA_elements_GSbandgap','formulaA_elements_GSestBCClatcnt', 'formulaA_elements_GSvolume_pa',
'formulaA_elements_ICSDVolume','formulaA_elements_Column', 'formulaA_elements_Number',
'formulaA_elements_Row','formulaB_elements_GSestBCClatcnt', 'formulaB_elements_GSvolume_pa',
'formulaB_elements_ICSDVolume','formulaB_elements_Column', 'formulaB_elements_Number',
'formulaB_elements_Row', 'formulaA', 'formulaB'], axis = 1)
return df_new
# Using GBDT to predict the test data
def GBDT(X, y, test):
    """Train a gradient-boosted classifier on (X, y) and label the test frame.

    Parameters
    ----------
    X, y
        Training features and targets, passed straight to ``clf.fit``.
    test : pandas.DataFrame
        Frame to predict on.  It must contain the three sort-key columns used
        below; it is passed to ``predict`` as-is, so presumably its columns
        match the layout of ``X`` -- verify against the caller.

    Returns
    -------
    pandas.DataFrame
        A sorted copy of ``test`` with a fresh 0..n-1 index and the predictions
        in a new ``'label'`` column.
    """
    sort_keys = ['formulaA_elements_AtomicVolume', 'formulaB_elements_AtomicVolume', 'B ratio']
    # drop=True discards the old index outright.  The previous
    # reset_index() + drop('index') raised KeyError when the index had a name
    # and removed a genuine feature if a column called 'index' existed.
    df_test = test.sort_values(by=sort_keys).reset_index(drop=True)
    # Use best model to train then predict
    clf = GradientBoostingClassifier(n_estimators=1000, max_depth=5, learning_rate=0.1)
    clf.fit(X, y)
    # Predict before adding 'label', so the label column is never fed to the model.
    test_label = clf.predict(df_test)
    df_test['label'] = test_label
    return df_test
# After prediction, transform the test set back to the original format, with the stability vector stored as a string.
def binary_to_test(df_test, group_size=11):
    """Collapse per-ratio predictions back into one row per compound pair.

    ``df_test`` is read as consecutive groups of ``group_size`` rows.  The
    ``'label'`` values of each group are joined into a string such as
    ``'[1.0,0.0,...,1.0]'`` and stored in a new ``'stabilityVec'`` column on
    the last row of the group; only those last rows are returned.

    Parameters
    ----------
    df_test : pandas.DataFrame
        Frame with ``'label'`` and ``'B ratio'`` columns whose rows are already
        ordered so that each pair occupies ``group_size`` consecutive rows.
    group_size : int, default 11
        Number of rows (B-ratio steps) per pair.

    Returns
    -------
    pandas.DataFrame
        One row per group, keeping that row's original index label, without
        the ``'B ratio'`` and ``'label'`` columns and with ``'stabilityVec'``
        appended.

    Raises
    ------
    ValueError
        If ``group_size`` is not positive or the row count is not a multiple
        of ``group_size``.
    """
    if group_size < 1:
        raise ValueError('group_size must be a positive integer')
    n_rows = len(df_test)
    if n_rows % group_size:
        raise ValueError('expected a multiple of %d rows, got %d' % (group_size, n_rows))
    # Derive the group count from the data instead of hard-coding 749 pairs.
    n_groups = n_rows // group_size

    label_matrix = df_test['label'].values.reshape(n_groups, group_size)
    # Labels are rendered as floats ('1.0' / '0.0').  The earlier row-wise
    # lookup only produced that form through implicit upcasting of the row.
    vectors = ['[' + ','.join(str(float(value)) for value in group) + ']'
               for group in label_matrix]

    # Take the last row of every group positionally and work on a copy.  This
    # avoids chained assignment on a slice, and no NaN filtering is needed to
    # discard the other rows of each group.
    dfy = df_test.iloc[group_size - 1::group_size].copy()
    dfy['stabilityVec'] = vectors
    return dfy.drop(['B ratio', 'label'], axis=1)
#df_new.sort_values(by=['formulaA', 'formulaB'])
# Expand the raw test frame into one row per B ratio.
# NOTE(review): `test` is not defined in this part of the notebook -- presumably loaded in an earlier cell.
test_df = test_to_binary(test)
# Preview the first rows of the expanded frame.
test_df.head()
| B ratio | avg_coordination_A | avg_coordination_B | avg_nearest_neighbor_distance_A | avg_nearest_neighbor_distance_B | formulaA_elements_AtomicVolume | formulaA_elements_AtomicWeight | formulaA_elements_BoilingT | formulaA_elements_BulkModulus | formulaA_elements_CovalentRadius | ... | formulaB_elements_NdValence | formulaB_elements_NfUnfilled | formulaB_elements_NfValence | formulaB_elements_NpUnfilled | formulaB_elements_NpValence | formulaB_elements_NsUnfilled | formulaB_elements_NsValence | formulaB_elements_Polarizability | formulaB_elements_ShearModulus | formulaB_elements_SpaceGroupNumber | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 100 | 12.0 | 12.0 | 3.99462 | 2.20087 | 37.433086 | 227.0 | 3473.0 | 0.0 | 215 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 5.60 | 132.0 | 194 |
| 1 | 100 | 12.0 | 6.0 | 3.99462 | 3.00785 | 37.433086 | 227.0 | 3473.0 | 0.0 | 215 | ... | 10 | 0 | 0 | 0 | 0 | 0 | 2 | 7.32 | 19.0 | 194 |
| 2 | 100 | 12.0 | 8.0 | 3.99462 | 5.32395 | 37.433086 | 227.0 | 3473.0 | 0.0 | 215 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 59.42 | 0.0 | 229 |
| 3 | 100 | 12.0 | 12.0 | 3.99462 | 3.48112 | 37.433086 | 227.0 | 3473.0 | 0.0 | 215 | ... | 0 | 3 | 11 | 0 | 0 | 0 | 2 | 23.60 | 26.0 | 194 |
| 4 | 100 | 12.0 | 8.0 | 3.99462 | 4.57083 | 37.433086 | 227.0 | 3473.0 | 0.0 | 215 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 43.23 | 1.3 | 229 |
5 rows × 84 columns
# Train on (X, y) and predict a label for every expanded test row.
test_label = GBDT(X, y, test_df)
# Collapse the per-ratio labels into one stabilityVec string per pair.
submission_test = binary_to_test(test_label)
# formulaA/formulaB were dropped by test_to_binary, so the two atomic-weight columns serve as the join key.
sub = submission_test[['formulaA_elements_AtomicWeight','formulaB_elements_AtomicWeight', 'stabilityVec']]
# Attach the predicted vectors to the original test rows.
df_test_submission = pd.merge(test, sub, on=['formulaA_elements_AtomicWeight','formulaB_elements_AtomicWeight'], how = 'inner')
#df_test_submission.sort_values(by=['formulaA_elements_AtomicVolume','formulaB_elements_AtomicVolume'])
# Write the submission file to the working directory.
df_test_submission.to_csv('test_data_submission.csv')
# df_ml is the training data with 11 classes separated:
# binary_composition is defined elsewhere in the notebook; it returns three values and only the first is used here.
df_ml, _, _ = binary_composition(df_new)
print(df_ml.shape)
(2572, 94)
# Prepare the multi-label data set.
# X: every column except the last 11 (presumably the 11 separated class columns).
# y: only consider class 1 to class 9 where A and B are in compound form.
X_ml = df_ml.iloc[:, :-11]
y_ml = df_ml.iloc[:, -10: -1]
# `np.int` was just an alias of the builtin `int`; it was deprecated in NumPy 1.20
# and removed in 1.24, so use the builtin directly.
y_ml = y_ml.astype(int)
print(X_ml.shape)
print(y_ml.shape)
(2572, 83) (2572, 9)
#1. Binary Relevance:
from imblearn.over_sampling import SMOTE
# Step 1: initialize the global variables.
# Step 2: standardize the training data and test data. ~N(0, 1)
# Step 3: train each class (9 classes) separately. for each class, upsampling minority class by SMOTE
# Step 4: output F1 scores and confusion matrix for each classifier
class BRClassifier():
    """Binary-relevance evaluator: one binary classifier per label column.

    ``fit`` holds out a test split, standardizes the features, and then, for
    every label column, oversamples the training split with SMOTE, fits the
    estimator and prints precision / recall / F1 on the held-out split.  The
    per-label confusion matrices are summed into a single matrix that is
    reported as overall metrics and plotted.

    Labels are expected to be 0/1 indicators: confusion matrices are built
    with ``labels=[0, 1]`` so they are always 2x2.

    Parameters
    ----------
    estimator
        Estimator to evaluate; a clone is stored, the argument is not touched.
    scoring : sequence of callables
        Called as ``score(y_true, y_pred)``.  The first three entries are
        reported as Precision, Recall and F1, in that order.
    test_size : float
        Passed to ``train_test_split``.
    random_state : int
        Seed for the train/test split and for SMOTE.

    Attributes
    ----------
    scores_ : dict
        Rounded scores of the most recently evaluated label.
    label_scores_ : dict
        Maps each label name to its rounded scores dict (set by ``fit``).
    """

    def __init__(self, estimator, scoring = [precision_score,recall_score,f1_score], test_size = 0.25, random_state = 1):
        # Copy so the shared default list can never be mutated through an instance.
        self.scoring = list(scoring)
        self.estimator = clone(estimator)
        self.test_size = test_size
        self.random_state = random_state

    def standardization(self, X_train, X_test):
        """Scale both splits with a StandardScaler fitted on the training split only."""
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        return X_train_scaled, X_test_scaled

    def fit(self, X, y):
        """Evaluate the estimator on every label column of ``y``.

        ``y`` must expose ``.columns`` (one column per label).  Prints the
        scores of each label and the overall metrics, and shows the aggregated
        confusion matrix.  Returns nothing.
        """
        # partition data into train and test set
        X_train, X_test, y_train, y_test = train_test_split(
            X, y,
            test_size=self.test_size, random_state=self.random_state,
            shuffle=True)
        X_train_scaled, X_test_scaled = self.standardization(X_train, X_test)

        # Build one binary classifier per label and train each label separately.
        # Each label's confusion matrix (rows: true 0/1, columns: predicted 0/1)
        # is accumulated into one matrix for the overall model evaluation.
        confusionMatrix = np.zeros((2, 2), dtype=int)
        self.label_scores_ = {}
        for label in y_train.columns:
            # output F1 score and confusion matrix for each classifier
            scores, cm = self._calc_score(X_train_scaled, y_train[label], X_test_scaled, y_test[label])
            self.scores_ = {'Precision': np.round(scores[0], 3),'Recall': np.round(scores[1], 3),
                            'F1': np.round(scores[2], 3)}
            # Keep every label's scores; scores_ alone only remembers the last one.
            self.label_scores_[label] = self.scores_
            print (label, self.scores_)
            confusionMatrix += cm

        false_pos = confusionMatrix[0, 1]
        false_neg = confusionMatrix[1, 0]
        true_pos = confusionMatrix[1, 1]
        # Guard the ratios: with no predicted or no actual positives the
        # denominators are zero and the metric is reported as 0.
        predicted_pos = true_pos + false_pos
        actual_pos = true_pos + false_neg
        precision = true_pos / predicted_pos if predicted_pos else 0.0
        recall = true_pos / actual_pos if actual_pos else 0.0
        f1 = 2*precision*recall/(precision + recall) if (precision + recall) else 0.0
        print ('\n9 Classes Overall Precision: %.3f, Recall: %.3f, F1 score: %.3f' %(precision, recall, f1))

        # Plot Confusion Matrix
        fig, ax = plt.subplots(figsize=(3, 3))
        ax.matshow(confusionMatrix, cmap=plt.cm.Blues, alpha=0.3)
        for i in range(confusionMatrix.shape[0]):
            for j in range(confusionMatrix.shape[1]):
                ax.text(x=j, y=i, s=confusionMatrix[i, j], va='center', ha='center')
        plt.title('9 Classes Overall Confusion Matrix', y = 1.15)
        plt.xlabel('Predicted label')
        plt.ylabel('True label')
        plt.show()

    def _calc_score(self, X_train_scaled, y_train, X_test_scaled, y_test):
        """Fit on the SMOTE-resampled training split and score on the test split.

        Returns ``(scores, cm)``: the list of ``self.scoring`` results and the
        2x2 confusion matrix for this label.
        """
        # Synthetic Minority Upsampling to training data ONLY.
        # Seeded so repeated runs resample identically.
        smt = SMOTE(random_state=self.random_state)
        X_train_smt, y_train_smt = smt.fit_resample(X_train_scaled, y_train)
        self.estimator.fit(X_train_smt, y_train_smt)
        y_pred = self.estimator.predict(X_test_scaled)
        # labels=[0, 1] keeps the matrix 2x2 even when only one class shows up
        # in both y_test and y_pred; otherwise cm[0][1] would fail in fit().
        cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
        scores = [score_type(y_test, y_pred) for score_type in self.scoring]
        return scores, cm
# L1-penalized logistic regression with balanced class weights, evaluated per label.
# The solver is pinned to liblinear: it supports the L1 penalty and was the
# implicit default before scikit-learn 0.22, whose newer default (lbfgs) rejects penalty='l1'.
lr = LogisticRegression(penalty = 'l1', C = 1.0, class_weight='balanced', solver='liblinear')
clf = BRClassifier(lr)
clf.fit(X_ml, y_ml)
class_1 {'Precision': 0.12, 'Recall': 0.545, 'F1': 0.197}
class_2 {'Precision': 0.13, 'Recall': 0.667, 'F1': 0.218}
class_3 {'Precision': 0.5, 'Recall': 0.788, 'F1': 0.612}
class_4 {'Precision': 0.213, 'Recall': 0.675, 'F1': 0.323}
class_5 {'Precision': 0.432, 'Recall': 0.748, 'F1': 0.547}
class_6 {'Precision': 0.177, 'Recall': 0.636, 'F1': 0.277}
class_7 {'Precision': 0.284, 'Recall': 0.798, 'F1': 0.419}
class_8 {'Precision': 0.373, 'Recall': 0.735, 'F1': 0.495}
class_9 {'Precision': 0.244, 'Recall': 0.769, 'F1': 0.37}
9 Classes Overall Precision: 0.322, Recall: 0.744, F1 score: 0.449
# RBF-kernel SVM with balanced class weights, evaluated per label by BRClassifier.
svm = SVC(kernel = 'rbf', C = 100, gamma = 0.01,class_weight='balanced')
clf_svm = BRClassifier(svm)
clf_svm.fit(X_ml, y_ml)
class_1 {'Precision': 0.312, 'Recall': 0.455, 'F1': 0.37}
class_2 {'Precision': 0.269, 'Recall': 0.389, 'F1': 0.318}
class_3 {'Precision': 0.647, 'Recall': 0.678, 'F1': 0.662}
class_4 {'Precision': 0.333, 'Recall': 0.45, 'F1': 0.383}
class_5 {'Precision': 0.687, 'Recall': 0.667, 'F1': 0.677}
class_6 {'Precision': 0.396, 'Recall': 0.477, 'F1': 0.433}
class_7 {'Precision': 0.467, 'Recall': 0.5, 'F1': 0.483}
class_8 {'Precision': 0.667, 'Recall': 0.612, 'F1': 0.638}
class_9 {'Precision': 0.333, 'Recall': 0.385, 'F1': 0.357}
9 Classes Overall Precision: 0.553, Recall: 0.589, F1 score: 0.570
# GBDT did not outperform RF: each class label is sparse, and GBDT did not do well on this type of problem.
# Next step: fine-tune the hyperparameters to see whether it can be improved.
# Baseline settings (the same values the GBDT() helper uses), evaluated per label by BRClassifier.
gbdt = GradientBoostingClassifier(n_estimators = 1000, max_depth = 5, learning_rate = 0.1)
clf_gbdt = BRClassifier(gbdt)
clf_gbdt.fit(X_ml, y_ml)
class_1 {'Precision': 0.278, 'Recall': 0.455, 'F1': 0.345}
class_2 {'Precision': 0.263, 'Recall': 0.278, 'F1': 0.27}
class_3 {'Precision': 0.701, 'Recall': 0.61, 'F1': 0.652}
class_4 {'Precision': 0.514, 'Recall': 0.45, 'F1': 0.48}
class_5 {'Precision': 0.719, 'Recall': 0.681, 'F1': 0.7}
class_6 {'Precision': 0.543, 'Recall': 0.432, 'F1': 0.481}
class_7 {'Precision': 0.606, 'Recall': 0.512, 'F1': 0.555}
class_8 {'Precision': 0.61, 'Recall': 0.51, 'F1': 0.556}
class_9 {'Precision': 0.7, 'Recall': 0.538, 'F1': 0.609}
9 Classes Overall Precision: 0.625, Recall: 0.557, F1 score: 0.589
# Grid search for the best GBDT hyperparameters
def gridSearchGBDT(X, y):
    """Print held-out scores for every GBDT hyperparameter combination, per label.

    A 10% test split is held out (``random_state=1``).  For each label column
    of ``y`` the training split is oversampled once with SMOTE, and then one
    GradientBoostingClassifier is fitted per combination of tree count, depth
    and learning rate.  Precision, recall and F1 on the held-out split are
    printed for each fit.  Nothing is returned.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.1, random_state=1, shuffle=True)

    # Hyperparameter ranges to explore.
    tree_counts = [500, 1000]
    depths = [3, 5, 8]
    rates = [0.01, 0.1, 1]
    # Flattened grid; tree count varies slowest and learning rate fastest.
    # 2 x 3 x 3 combinations x 9 (classes) = 162 fittings.
    combos = [(n_trees, depth, rate)
              for n_trees in tree_counts
              for depth in depths
              for rate in rates]
    template = ('class: %s, n_estimators: %d, max_depth: %d, learning_rate: %.3f, '
                'precision: %.3f, recall: %.3f, F1: %.3f')

    for label in y_train.columns:
        # Upsample the minority class of this label (training split only).
        X_smt, y_smt = SMOTE().fit_resample(X_train, y_train[label])
        y_true = y_test[label]
        for n_trees, depth, rate in combos:
            model = GradientBoostingClassifier(n_estimators=n_trees, learning_rate=rate, max_depth=depth)
            model.fit(X_smt, y_smt)
            y_hat = model.predict(X_test)
            precision = precision_score(y_true, y_hat)
            recall = recall_score(y_true, y_hat)
            f1 = f1_score(y_true, y_hat)
            print(template % (label, n_trees, depth, rate, precision, recall, f1))
# Run the grid search on the multi-label data; results are printed, nothing is returned.
gridSearchGBDT(X_ml, y_ml)
class: class_1, n_estimators: 500, max_depth: 3, learning_rate: 0.010, precision: 0.250, recall: 0.400, F1: 0.308 class: class_1, n_estimators: 500, max_depth: 3, learning_rate: 0.100, precision: 0.250, recall: 0.200, F1: 0.222 class: class_1, n_estimators: 500, max_depth: 3, learning_rate: 1.000, precision: 0.200, recall: 0.200, F1: 0.200 class: class_1, n_estimators: 500, max_depth: 5, learning_rate: 0.010, precision: 0.250, recall: 0.400, F1: 0.308 class: class_1, n_estimators: 500, max_depth: 5, learning_rate: 0.100, precision: 0.200, recall: 0.200, F1: 0.200 class: class_1, n_estimators: 500, max_depth: 5, learning_rate: 1.000, precision: 0.250, recall: 0.400, F1: 0.308 class: class_1, n_estimators: 500, max_depth: 8, learning_rate: 0.010, precision: 0.143, recall: 0.200, F1: 0.167 class: class_1, n_estimators: 500, max_depth: 8, learning_rate: 0.100, precision: 0.286, recall: 0.400, F1: 0.333 class: class_1, n_estimators: 500, max_depth: 8, learning_rate: 1.000, precision: 0.250, recall: 0.400, F1: 0.308 class: class_1, n_estimators: 1000, max_depth: 3, learning_rate: 0.010, precision: 0.333, recall: 0.400, F1: 0.364 class: class_1, n_estimators: 1000, max_depth: 3, learning_rate: 0.100, precision: 0.250, recall: 0.200, F1: 0.222 class: class_1, n_estimators: 1000, max_depth: 3, learning_rate: 1.000, precision: 0.200, recall: 0.200, F1: 0.200 class: class_1, n_estimators: 1000, max_depth: 5, learning_rate: 0.010, precision: 0.333, recall: 0.400, F1: 0.364 class: class_1, n_estimators: 1000, max_depth: 5, learning_rate: 0.100, precision: 0.200, recall: 0.200, F1: 0.200 class: class_1, n_estimators: 1000, max_depth: 5, learning_rate: 1.000, precision: 0.250, recall: 0.400, F1: 0.308 class: class_1, n_estimators: 1000, max_depth: 8, learning_rate: 0.010, precision: 0.286, recall: 0.400, F1: 0.333 class: class_1, n_estimators: 1000, max_depth: 8, learning_rate: 0.100, precision: 0.333, recall: 0.400, F1: 0.364 class: class_1, n_estimators: 1000, max_depth: 8, 
learning_rate: 1.000, precision: 0.111, recall: 0.200, F1: 0.143 class: class_2, n_estimators: 500, max_depth: 3, learning_rate: 0.010, precision: 0.133, recall: 0.222, F1: 0.167 class: class_2, n_estimators: 500, max_depth: 3, learning_rate: 0.100, precision: 0.333, recall: 0.222, F1: 0.267 class: class_2, n_estimators: 500, max_depth: 3, learning_rate: 1.000, precision: 0.333, recall: 0.333, F1: 0.333 class: class_2, n_estimators: 500, max_depth: 5, learning_rate: 0.010, precision: 0.222, recall: 0.222, F1: 0.222 class: class_2, n_estimators: 500, max_depth: 5, learning_rate: 0.100, precision: 0.286, recall: 0.222, F1: 0.250 class: class_2, n_estimators: 500, max_depth: 5, learning_rate: 1.000, precision: 0.250, recall: 0.111, F1: 0.154 class: class_2, n_estimators: 500, max_depth: 8, learning_rate: 0.010, precision: 0.167, recall: 0.111, F1: 0.133 class: class_2, n_estimators: 500, max_depth: 8, learning_rate: 0.100, precision: 0.400, recall: 0.222, F1: 0.286 class: class_2, n_estimators: 500, max_depth: 8, learning_rate: 1.000, precision: 0.200, recall: 0.222, F1: 0.211 class: class_2, n_estimators: 1000, max_depth: 3, learning_rate: 0.010, precision: 0.143, recall: 0.111, F1: 0.125 class: class_2, n_estimators: 1000, max_depth: 3, learning_rate: 0.100, precision: 0.333, recall: 0.222, F1: 0.267 class: class_2, n_estimators: 1000, max_depth: 3, learning_rate: 1.000, precision: 0.333, recall: 0.333, F1: 0.333 class: class_2, n_estimators: 1000, max_depth: 5, learning_rate: 0.010, precision: 0.286, recall: 0.222, F1: 0.250 class: class_2, n_estimators: 1000, max_depth: 5, learning_rate: 0.100, precision: 0.286, recall: 0.222, F1: 0.250 class: class_2, n_estimators: 1000, max_depth: 5, learning_rate: 1.000, precision: 0.250, recall: 0.111, F1: 0.154 class: class_2, n_estimators: 1000, max_depth: 8, learning_rate: 0.010, precision: 0.200, recall: 0.111, F1: 0.143 class: class_2, n_estimators: 1000, max_depth: 8, learning_rate: 0.100, precision: 0.333, recall: 
0.222, F1: 0.267 class: class_2, n_estimators: 1000, max_depth: 8, learning_rate: 1.000, precision: 0.125, recall: 0.111, F1: 0.118 class: class_3, n_estimators: 500, max_depth: 3, learning_rate: 0.010, precision: 0.611, recall: 0.721, F1: 0.662 class: class_3, n_estimators: 500, max_depth: 3, learning_rate: 0.100, precision: 0.750, recall: 0.689, F1: 0.718 class: class_3, n_estimators: 500, max_depth: 3, learning_rate: 1.000, precision: 0.646, recall: 0.689, F1: 0.667 class: class_3, n_estimators: 500, max_depth: 5, learning_rate: 0.010, precision: 0.682, recall: 0.738, F1: 0.709 class: class_3, n_estimators: 500, max_depth: 5, learning_rate: 0.100, precision: 0.726, recall: 0.738, F1: 0.732 class: class_3, n_estimators: 500, max_depth: 5, learning_rate: 1.000, precision: 0.700, recall: 0.689, F1: 0.694 class: class_3, n_estimators: 500, max_depth: 8, learning_rate: 0.010, precision: 0.710, recall: 0.721, F1: 0.715 class: class_3, n_estimators: 500, max_depth: 8, learning_rate: 0.100, precision: 0.692, recall: 0.738, F1: 0.714 class: class_3, n_estimators: 500, max_depth: 8, learning_rate: 1.000, precision: 0.714, recall: 0.656, F1: 0.684 class: class_3, n_estimators: 1000, max_depth: 3, learning_rate: 0.010, precision: 0.651, recall: 0.672, F1: 0.661 class: class_3, n_estimators: 1000, max_depth: 3, learning_rate: 0.100, precision: 0.724, recall: 0.689, F1: 0.706 class: class_3, n_estimators: 1000, max_depth: 3, learning_rate: 1.000, precision: 0.667, recall: 0.689, F1: 0.677 class: class_3, n_estimators: 1000, max_depth: 5, learning_rate: 0.010, precision: 0.700, recall: 0.689, F1: 0.694 class: class_3, n_estimators: 1000, max_depth: 5, learning_rate: 0.100, precision: 0.717, recall: 0.705, F1: 0.711 class: class_3, n_estimators: 1000, max_depth: 5, learning_rate: 1.000, precision: 0.700, recall: 0.689, F1: 0.694 class: class_3, n_estimators: 1000, max_depth: 8, learning_rate: 0.010, precision: 0.703, recall: 0.738, F1: 0.720 class: class_3, n_estimators: 1000, 
max_depth: 8, learning_rate: 0.100, precision: 0.717, recall: 0.705, F1: 0.711 class: class_3, n_estimators: 1000, max_depth: 8, learning_rate: 1.000, precision: 0.687, recall: 0.754, F1: 0.719 class: class_4, n_estimators: 500, max_depth: 3, learning_rate: 0.010, precision: 0.276, recall: 0.444, F1: 0.340 class: class_4, n_estimators: 500, max_depth: 3, learning_rate: 0.100, precision: 0.667, recall: 0.333, F1: 0.444 class: class_4, n_estimators: 500, max_depth: 3, learning_rate: 1.000, precision: 0.333, recall: 0.278, F1: 0.303 class: class_4, n_estimators: 500, max_depth: 5, learning_rate: 0.010, precision: 0.467, recall: 0.389, F1: 0.424 class: class_4, n_estimators: 500, max_depth: 5, learning_rate: 0.100, precision: 0.538, recall: 0.389, F1: 0.452 class: class_4, n_estimators: 500, max_depth: 5, learning_rate: 1.000, precision: 0.412, recall: 0.389, F1: 0.400 class: class_4, n_estimators: 500, max_depth: 8, learning_rate: 0.010, precision: 0.471, recall: 0.444, F1: 0.457 class: class_4, n_estimators: 500, max_depth: 8, learning_rate: 0.100, precision: 0.615, recall: 0.444, F1: 0.516 class: class_4, n_estimators: 500, max_depth: 8, learning_rate: 1.000, precision: 0.444, recall: 0.444, F1: 0.444 class: class_4, n_estimators: 1000, max_depth: 3, learning_rate: 0.010, precision: 0.368, recall: 0.389, F1: 0.378 class: class_4, n_estimators: 1000, max_depth: 3, learning_rate: 0.100, precision: 0.600, recall: 0.333, F1: 0.429 class: class_4, n_estimators: 1000, max_depth: 3, learning_rate: 1.000, precision: 0.333, recall: 0.278, F1: 0.303 class: class_4, n_estimators: 1000, max_depth: 5, learning_rate: 0.010, precision: 0.571, recall: 0.444, F1: 0.500 class: class_4, n_estimators: 1000, max_depth: 5, learning_rate: 0.100, precision: 0.500, recall: 0.389, F1: 0.438 class: class_4, n_estimators: 1000, max_depth: 5, learning_rate: 1.000, precision: 0.467, recall: 0.389, F1: 0.424 class: class_4, n_estimators: 1000, max_depth: 8, learning_rate: 0.010, precision: 0.500, 
recall: 0.389, F1: 0.438 class: class_4, n_estimators: 1000, max_depth: 8, learning_rate: 0.100, precision: 0.636, recall: 0.389, F1: 0.483 class: class_4, n_estimators: 1000, max_depth: 8, learning_rate: 1.000, precision: 0.353, recall: 0.333, F1: 0.343 class: class_5, n_estimators: 500, max_depth: 3, learning_rate: 0.010, precision: 0.562, recall: 0.726, F1: 0.634 class: class_5, n_estimators: 500, max_depth: 3, learning_rate: 0.100, precision: 0.685, recall: 0.597, F1: 0.638 class: class_5, n_estimators: 500, max_depth: 3, learning_rate: 1.000, precision: 0.727, recall: 0.645, F1: 0.684 class: class_5, n_estimators: 500, max_depth: 5, learning_rate: 0.010, precision: 0.672, recall: 0.661, F1: 0.667 class: class_5, n_estimators: 500, max_depth: 5, learning_rate: 0.100, precision: 0.778, recall: 0.565, F1: 0.654 class: class_5, n_estimators: 500, max_depth: 5, learning_rate: 1.000, precision: 0.755, recall: 0.645, F1: 0.696 class: class_5, n_estimators: 500, max_depth: 8, learning_rate: 0.010, precision: 0.745, recall: 0.661, F1: 0.701 class: class_5, n_estimators: 500, max_depth: 8, learning_rate: 0.100, precision: 0.769, recall: 0.645, F1: 0.702 class: class_5, n_estimators: 500, max_depth: 8, learning_rate: 1.000, precision: 0.712, recall: 0.597, F1: 0.649 class: class_5, n_estimators: 1000, max_depth: 3, learning_rate: 0.010, precision: 0.620, recall: 0.710, F1: 0.662 class: class_5, n_estimators: 1000, max_depth: 3, learning_rate: 0.100, precision: 0.702, recall: 0.645, F1: 0.672 class: class_5, n_estimators: 1000, max_depth: 3, learning_rate: 1.000, precision: 0.722, recall: 0.629, F1: 0.672 class: class_5, n_estimators: 1000, max_depth: 5, learning_rate: 0.010, precision: 0.714, recall: 0.645, F1: 0.678 class: class_5, n_estimators: 1000, max_depth: 5, learning_rate: 0.100, precision: 0.706, recall: 0.581, F1: 0.637 class: class_5, n_estimators: 1000, max_depth: 5, learning_rate: 1.000, precision: 0.690, recall: 0.645, F1: 0.667 class: class_5, 
n_estimators: 1000, max_depth: 8, learning_rate: 0.010, precision: 0.765, recall: 0.629, F1: 0.690 class: class_5, n_estimators: 1000, max_depth: 8, learning_rate: 0.100, precision: 0.755, recall: 0.645, F1: 0.696 class: class_5, n_estimators: 1000, max_depth: 8, learning_rate: 1.000, precision: 0.732, recall: 0.661, F1: 0.695 class: class_6, n_estimators: 500, max_depth: 3, learning_rate: 0.010, precision: 0.206, recall: 0.438, F1: 0.280 class: class_6, n_estimators: 500, max_depth: 3, learning_rate: 0.100, precision: 0.462, recall: 0.375, F1: 0.414 class: class_6, n_estimators: 500, max_depth: 3, learning_rate: 1.000, precision: 0.467, recall: 0.438, F1: 0.452 class: class_6, n_estimators: 500, max_depth: 5, learning_rate: 0.010, precision: 0.333, recall: 0.438, F1: 0.378 class: class_6, n_estimators: 500, max_depth: 5, learning_rate: 0.100, precision: 0.533, recall: 0.500, F1: 0.516 class: class_6, n_estimators: 500, max_depth: 5, learning_rate: 1.000, precision: 0.400, recall: 0.375, F1: 0.387 class: class_6, n_estimators: 500, max_depth: 8, learning_rate: 0.010, precision: 0.412, recall: 0.438, F1: 0.424 class: class_6, n_estimators: 500, max_depth: 8, learning_rate: 0.100, precision: 0.583, recall: 0.438, F1: 0.500 class: class_6, n_estimators: 500, max_depth: 8, learning_rate: 1.000, precision: 0.320, recall: 0.500, F1: 0.390 class: class_6, n_estimators: 1000, max_depth: 3, learning_rate: 0.010, precision: 0.280, recall: 0.438, F1: 0.341 class: class_6, n_estimators: 1000, max_depth: 3, learning_rate: 0.100, precision: 0.500, recall: 0.375, F1: 0.429 class: class_6, n_estimators: 1000, max_depth: 3, learning_rate: 1.000, precision: 0.467, recall: 0.438, F1: 0.452 class: class_6, n_estimators: 1000, max_depth: 5, learning_rate: 0.010, precision: 0.438, recall: 0.438, F1: 0.438 class: class_6, n_estimators: 1000, max_depth: 5, learning_rate: 0.100, precision: 0.571, recall: 0.500, F1: 0.533 class: class_6, n_estimators: 1000, max_depth: 5, learning_rate: 
1.000, precision: 0.429, recall: 0.375, F1: 0.400 class: class_6, n_estimators: 1000, max_depth: 8, learning_rate: 0.010, precision: 0.438, recall: 0.438, F1: 0.438 class: class_6, n_estimators: 1000, max_depth: 8, learning_rate: 0.100, precision: 0.571, recall: 0.500, F1: 0.533 class: class_6, n_estimators: 1000, max_depth: 8, learning_rate: 1.000, precision: 0.308, recall: 0.500, F1: 0.381 class: class_7, n_estimators: 500, max_depth: 3, learning_rate: 0.010, precision: 0.431, recall: 0.667, F1: 0.524 class: class_7, n_estimators: 500, max_depth: 3, learning_rate: 0.100, precision: 0.640, recall: 0.485, F1: 0.552 class: class_7, n_estimators: 500, max_depth: 3, learning_rate: 1.000, precision: 0.621, recall: 0.545, F1: 0.581 class: class_7, n_estimators: 500, max_depth: 5, learning_rate: 0.010, precision: 0.469, recall: 0.455, F1: 0.462 class: class_7, n_estimators: 500, max_depth: 5, learning_rate: 0.100, precision: 0.556, recall: 0.455, F1: 0.500 class: class_7, n_estimators: 500, max_depth: 5, learning_rate: 1.000, precision: 0.750, recall: 0.636, F1: 0.689 class: class_7, n_estimators: 500, max_depth: 8, learning_rate: 0.010, precision: 0.567, recall: 0.515, F1: 0.540 class: class_7, n_estimators: 500, max_depth: 8, learning_rate: 0.100, precision: 0.593, recall: 0.485, F1: 0.533 class: class_7, n_estimators: 500, max_depth: 8, learning_rate: 1.000, precision: 0.533, recall: 0.485, F1: 0.508 class: class_7, n_estimators: 1000, max_depth: 3, learning_rate: 0.010, precision: 0.500, recall: 0.606, F1: 0.548 class: class_7, n_estimators: 1000, max_depth: 3, learning_rate: 0.100, precision: 0.667, recall: 0.545, F1: 0.600 class: class_7, n_estimators: 1000, max_depth: 3, learning_rate: 1.000, precision: 0.633, recall: 0.576, F1: 0.603 class: class_7, n_estimators: 1000, max_depth: 5, learning_rate: 0.010, precision: 0.600, recall: 0.455, F1: 0.517 class: class_7, n_estimators: 1000, max_depth: 5, learning_rate: 0.100, precision: 0.583, recall: 0.424, F1: 0.491 
class: class_7, n_estimators: 1000, max_depth: 5, learning_rate: 1.000, precision: 0.724, recall: 0.636, F1: 0.677 class: class_7, n_estimators: 1000, max_depth: 8, learning_rate: 0.010, precision: 0.552, recall: 0.485, F1: 0.516 class: class_7, n_estimators: 1000, max_depth: 8, learning_rate: 0.100, precision: 0.615, recall: 0.485, F1: 0.542 class: class_7, n_estimators: 1000, max_depth: 8, learning_rate: 1.000, precision: 0.562, recall: 0.545, F1: 0.554 class: class_8, n_estimators: 500, max_depth: 3, learning_rate: 0.010, precision: 0.549, recall: 0.667, F1: 0.602 class: class_8, n_estimators: 500, max_depth: 3, learning_rate: 0.100, precision: 0.692, recall: 0.643, F1: 0.667 class: class_8, n_estimators: 500, max_depth: 3, learning_rate: 1.000, precision: 0.711, recall: 0.643, F1: 0.675 class: class_8, n_estimators: 500, max_depth: 5, learning_rate: 0.010, precision: 0.658, recall: 0.595, F1: 0.625 class: class_8, n_estimators: 500, max_depth: 5, learning_rate: 0.100, precision: 0.778, recall: 0.667, F1: 0.718 class: class_8, n_estimators: 500, max_depth: 5, learning_rate: 1.000, precision: 0.682, recall: 0.714, F1: 0.698 class: class_8, n_estimators: 500, max_depth: 8, learning_rate: 0.010, precision: 0.649, recall: 0.571, F1: 0.608 class: class_8, n_estimators: 500, max_depth: 8, learning_rate: 0.100, precision: 0.706, recall: 0.571, F1: 0.632 class: class_8, n_estimators: 500, max_depth: 8, learning_rate: 1.000, precision: 0.718, recall: 0.667, F1: 0.691 class: class_8, n_estimators: 1000, max_depth: 3, learning_rate: 0.010, precision: 0.625, recall: 0.595, F1: 0.610 class: class_8, n_estimators: 1000, max_depth: 3, learning_rate: 0.100, precision: 0.683, recall: 0.667, F1: 0.675 class: class_8, n_estimators: 1000, max_depth: 3, learning_rate: 1.000, precision: 0.711, recall: 0.643, F1: 0.675 class: class_8, n_estimators: 1000, max_depth: 5, learning_rate: 0.010, precision: 0.657, recall: 0.548, F1: 0.597 class: class_8, n_estimators: 1000, max_depth: 5, 
learning_rate: 0.100, precision: 0.737, recall: 0.667, F1: 0.700 class: class_8, n_estimators: 1000, max_depth: 5, learning_rate: 1.000, precision: 0.643, recall: 0.643, F1: 0.643 class: class_8, n_estimators: 1000, max_depth: 8, learning_rate: 0.010, precision: 0.667, recall: 0.524, F1: 0.587 class: class_8, n_estimators: 1000, max_depth: 8, learning_rate: 0.100, precision: 0.684, recall: 0.619, F1: 0.650 class: class_8, n_estimators: 1000, max_depth: 8, learning_rate: 1.000, precision: 0.684, recall: 0.619, F1: 0.650 class: class_9, n_estimators: 500, max_depth: 3, learning_rate: 0.010, precision: 0.375, recall: 1.000, F1: 0.545 class: class_9, n_estimators: 500, max_depth: 3, learning_rate: 0.100, precision: 0.500, recall: 0.667, F1: 0.571 class: class_9, n_estimators: 500, max_depth: 3, learning_rate: 1.000, precision: 0.600, recall: 1.000, F1: 0.750 class: class_9, n_estimators: 500, max_depth: 5, learning_rate: 0.010, precision: 0.375, recall: 1.000, F1: 0.545 class: class_9, n_estimators: 500, max_depth: 5, learning_rate: 0.100, precision: 0.600, recall: 1.000, F1: 0.750 class: class_9, n_estimators: 500, max_depth: 5, learning_rate: 1.000, precision: 0.500, recall: 1.000, F1: 0.667 class: class_9, n_estimators: 500, max_depth: 8, learning_rate: 0.010, precision: 0.250, recall: 0.667, F1: 0.364 class: class_9, n_estimators: 500, max_depth: 8, learning_rate: 0.100, precision: 0.333, recall: 0.667, F1: 0.444 class: class_9, n_estimators: 500, max_depth: 8, learning_rate: 1.000, precision: 0.429, recall: 1.000, F1: 0.600 class: class_9, n_estimators: 1000, max_depth: 3, learning_rate: 0.010, precision: 0.375, recall: 1.000, F1: 0.545 class: class_9, n_estimators: 1000, max_depth: 3, learning_rate: 0.100, precision: 0.500, recall: 0.667, F1: 0.571 class: class_9, n_estimators: 1000, max_depth: 3, learning_rate: 1.000, precision: 0.600, recall: 1.000, F1: 0.750 class: class_9, n_estimators: 1000, max_depth: 5, learning_rate: 0.010, precision: 0.375, recall: 
1.000, F1: 0.545 class: class_9, n_estimators: 1000, max_depth: 5, learning_rate: 0.100, precision: 0.500, recall: 1.000, F1: 0.667 class: class_9, n_estimators: 1000, max_depth: 5, learning_rate: 1.000, precision: 0.375, recall: 1.000, F1: 0.545 class: class_9, n_estimators: 1000, max_depth: 8, learning_rate: 0.010, precision: 0.286, recall: 0.667, F1: 0.400 class: class_9, n_estimators: 1000, max_depth: 8, learning_rate: 0.100, precision: 0.333, recall: 0.667, F1: 0.444 class: class_9, n_estimators: 1000, max_depth: 8, learning_rate: 1.000, precision: 0.333, recall: 0.667, F1: 0.444
# Wrap a 1000-tree random forest in BRClassifier (defined elsewhere in this
# notebook; presumably the Binary Relevance wrapper) and fit it on the
# multi-label feature/label frames X_ml / y_ml.
forest = RandomForestClassifier(n_estimators = 1000)
clf_forest = BRClassifier(forest)
# Last statement of the cell: the text that follows it in the notebook is this cell's output.
clf_forest.fit(X_ml, y_ml)
class_1 {'Precision': 0.375, 'Recall': 0.545, 'F1': 0.444}
class_2 {'Precision': 0.353, 'Recall': 0.333, 'F1': 0.343}
class_3 {'Precision': 0.736, 'Recall': 0.651, 'F1': 0.691}
class_4 {'Precision': 0.526, 'Recall': 0.5, 'F1': 0.513}
class_5 {'Precision': 0.746, 'Recall': 0.674, 'F1': 0.708}
class_6 {'Precision': 0.556, 'Recall': 0.455, 'F1': 0.5}
class_7 {'Precision': 0.638, 'Recall': 0.607, 'F1': 0.622}
class_8 {'Precision': 0.61, 'Recall': 0.51, 'F1': 0.556}
class_9 {'Precision': 0.667, 'Recall': 0.615, 'F1': 0.64}
9 Classes Overall Precision: 0.652, Recall: 0.589, F1 score: 0.619
# Build a Classifier Chains model using random forest, since it has a good track record on F1 score.
def ClassifierChains(X, y, random_state=None):
    """Evaluate a random-forest classifier chain on a 75/25 hold-out split.

    Labels are processed in the column order of ``y``. For each label a new
    forest is trained on a SMOTE-resampled version of the current training
    features, and the label is then appended as an extra feature for every
    label that follows: the true values on the training side, the predicted
    values on the test side. The F1 score of each label is printed as it is
    computed.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix, one row per sample.
    y : pandas.DataFrame
        Label matrix with one column per label, row-aligned with ``X``.
    random_state : int or None, optional
        Seed forwarded to both SMOTE and the random forest. The default
        ``None`` leaves them unseeded, as before.

    Returns
    -------
    pandas.DataFrame
        One row per label (index = label name) with its hold-out F1 score
        in the column ``'F1'``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=1, shuffle=True)

    # The chain adds one column per label below. Work on explicit copies so
    # the writes never target a slice of the caller's frame.
    X_train = X_train.copy()
    X_test = X_test.copy()

    scores = {}
    for label in y_train.columns:
        clf = RandomForestClassifier(n_estimators=1000, random_state=random_state)
        # upsampling minority class using synthetic data
        smt = SMOTE(random_state=random_state)
        X_train_smt, y_train_smt = smt.fit_resample(X_train, y_train[label])
        clf.fit(X_train_smt, y_train_smt)
        y_pred = clf.predict(X_test)

        # Extend the chain: ground truth for training, predictions for test.
        X_train[label] = y_train[label]
        X_test[label] = y_pred

        score = f1_score(y_test[label], y_pred)
        print(label, score)
        scores[label] = score

    # Hand the scores back so the caller's assignment receives real data.
    return pd.DataFrame({'F1': pd.Series(scores, dtype=float)})
# Run the classifier-chain evaluation on the multi-label data; the function
# prints one F1 score per label.
df_cc = ClassifierChains(X_ml, y_ml)
class_1 0.4 class_2 0.3636363636363636 class_3 0.7058823529411765 class_4 0.4931506849315069 class_5 0.6880000000000001 class_6 0.4556962025316455 class_7 0.5974025974025974 class_8 0.611111111111111 class_9 0.64
# Direct comparison between Approach I and Approach II:
# For binary classification, I removed the pure A and pure B samples, which have all-positive labels, to match
# the multi-label classification data set.
# Binary Classification with GBDT: F1 score 0.516:
# remove pure A and pure B samples
# (rows whose B_ratio is 0 or 100 are excluded; df_bc is built elsewhere in this notebook)
df_bc_9class = df_bc[(df_bc['B_ratio'] != 100) & (df_bc['B_ratio'] != 0)]
# prepare input feature X_2 and label y_2
# X_2 keeps every column except 'label'; y_2 is the 'label' column alone
X_2 = df_bc_9class.drop(['label'], axis = 1)
y_2 = df_bc_9class['label']
# Use best binary classification model GBDT
# BinaryClassification is a wrapper class defined elsewhere in this notebook.
gbdt_2 = GradientBoostingClassifier(n_estimators = 1000, max_depth = 5, learning_rate = 0.1)
gbdt_clf_2 = BinaryClassification(gbdt_2)
gbdt_clf_2.fit(X_2, y_2)
<__main__.BinaryClassification at 0x104546a90>
# Display the wrapper's scores_ attribute (presumably populated by the fit above — confirm in BinaryClassification).
gbdt_clf_2.scores_
{'Precision': 0.603, 'Recall': 0.457, 'F1': 0.52}
# Use Binary Relevance Method combined with RF for test data prediction:
# Load test set
# test_ml keeps the untouched file contents; df_test is the working copy that gets trimmed below.
test_ml= pd.read_csv('test_data.csv')
df_test = test_ml.copy()
# Drop the highly correlated features
# The two formula name columns ('formulaA', 'formulaB') are dropped as well.
# NOTE(review): 'formulaA_elements_GSbandgap' is dropped but there is no matching
# 'formulaB_elements_GSbandgap' entry — confirm this mirrors the training-side feature selection.
df_test = df_test.drop(['formulaA_elements_GSbandgap','formulaA_elements_GSestBCClatcnt', 'formulaA_elements_GSvolume_pa',
'formulaA_elements_ICSDVolume','formulaA_elements_Column', 'formulaA_elements_Number',
'formulaA_elements_Row','formulaB_elements_GSestBCClatcnt', 'formulaB_elements_GSvolume_pa',
'formulaB_elements_ICSDVolume','formulaB_elements_Column', 'formulaB_elements_Number',
'formulaB_elements_Row', 'formulaA', 'formulaB'], axis = 1)
# Predict the stability vector for test set
def predict(X, y, test):
    """Predict every label column of ``y`` for the rows of ``test``.

    One independent model is trained per label: the training data is
    resampled with SMOTE for that label, a 1000-tree random forest is fitted
    on the resampled data, and the forest's predictions for ``test`` are
    stored as strings.

    Parameters
    ----------
    X : training features.
    y : pandas.DataFrame of training labels, one column per label.
    test : pandas.DataFrame of test features (left unmodified).

    Returns
    -------
    pandas.DataFrame
        A copy of ``test`` with one extra string-typed column per label,
        appended in the column order of ``y``.
    """
    predictions = test.copy()
    # predict label for each class, thus there would be 9 predictions
    for label in y.columns:
        # Balance this label's classes with synthetic minority samples.
        X_balanced, y_balanced = SMOTE().fit_resample(X, y[label])
        model = RandomForestClassifier(n_estimators=1000)
        model.fit(X_balanced, y_balanced)
        # Always predict from the untouched test features, not the growing copy.
        predictions[label] = model.predict(test)
        predictions[label] = predictions[label].astype(str)
    return predictions
# Train on the full multi-label training data and predict every label for the trimmed test set.
test_pred = predict(X_ml, y_ml, df_test)
# Merge all 11 classes together into stabilityVec str format: '[1.0, 0.0, ......, 1.0]'
def submission_ml(df, test_data=None):
    """Build the submission table with one ``stabilityVec`` string per row.

    The last nine columns of ``df`` are taken as the predicted labels. They
    are converted to floats, joined with commas, and wrapped with a fixed
    ``1.0`` at both ends, giving an 11-element string of the form
    ``'[1.0,<9 predictions>,1.0]'``. The result is attached to the original
    test rows by an inner merge on the two atomic-weight columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Prediction frame whose last nine columns are the predicted labels
        and which contains ``formulaA_elements_AtomicWeight`` and
        ``formulaB_elements_AtomicWeight``. It is not modified.
    test_data : pandas.DataFrame, optional
        Original test rows to attach the vector to. Defaults to the
        notebook-level ``test_ml`` frame.

    Returns
    -------
    pandas.DataFrame
        The columns of ``test_data`` plus ``stabilityVec``.
    """
    if test_data is None:
        # Previous behaviour: merge against the frame loaded from test_data.csv.
        test_data = test_ml

    merge_keys = ['formulaA_elements_AtomicWeight', 'formulaB_elements_AtomicWeight']
    test_pred = df.copy()
    label_cols = test_pred.columns[-9:]

    # Builtin float replaces the np.float alias, which NumPy 1.24 removed.
    test_pred['stabilityVec'] = test_pred[label_cols].apply(
        lambda row: ','.join(row.astype(float).astype(str)), axis=1)
    test_pred['stabilityVec'] = test_pred['stabilityVec'].apply(
        lambda joined: '[' + '1.0,' + joined + ',1.0' + ']')

    sub = test_pred[merge_keys + ['stabilityVec']]
    df_test_submission = pd.merge(test_data, sub, on=merge_keys, how='inner')
    return df_test_submission
# Build the multi-label submission table and write it to disk.
test_sub = submission_ml(test_pred)
test_sub.to_csv('test_submission_ml.csv')
# Count rows whose stabilityVec string matches df_test_submission, a frame defined
# elsewhere in this notebook (presumably the binary-classification submission — confirm).
# The comparison is element-wise, so it assumes both frames list the test rows in the same order.
print('There are %d of predictions between Binary Classification and Multi-label Classification are the same' %
(np.sum(test_sub['stabilityVec'] == df_test_submission ['stabilityVec'])))
There are 549 of predictions between Binary Classification and Multi-label Classification are the same